Diff
checker
Texto
Texto
Imagens
Documentos
Excel
Pastas
Legal
Enterprise
Aplicativo para desktop
Preços
Fazer login
Baixar o Diffchecker Desktop
Comparar texto
Encontre a diferença entre dois arquivos de texto
Ferramentas
Histórico
Editor live
Recolher inalteradas
Sem quebra de linha
Layout
Dividido
Unificado
Nível de detalhe
Inteligente
Palavra
Caractere
Realce de sintaxe
Escolher sintaxe
Ignorar
Transformar texto
Ir à primeira mudança
Editar entrada
Diffchecker Desktop
A maneira mais segura de usar o Diffchecker. Obtenha o aplicativo Diffchecker Desktop: seus diffs nunca saem do seu computador!
Obter Desktop
GPT-2
Criado
há 5 anos
O diff nunca expira
Limpar
Exportar
Compartilhar
Explicar
39 remoções
Linhas
Total
Removido
Caracteres
Total
Removido
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
229 linhas
Copiar tudo
100 adições
Linhas
Total
Adicionado
Caracteres
Total
Adicionado
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
273 linhas
Copiar tudo
# coding=utf-8
# coding=utf-8
Copiar
Copiado
Copiar
Copiado
#
# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""GPT-2 model."""
"""GPT-2 model."""
import torch
import torch
from megatron import get_args
from megatron import get_args
Copiar
Copiado
Copiar
Copiado
from megatron import mpu
from megatron.module import MegatronModule
from megatron.module import MegatronModule
Copiar
Copiado
Copiar
Copiado
from functools import partial
from .language_model import parallel_lm_logits
from .language_model import parallel_lm_logits
from .language_model import get_language_model
from .language_model import get_language_model
from .utils import init_method_normal
from .utils import init_method_normal
from .utils import scaled_init_method_normal
from .utils import scaled_init_method_normal
Copiar
Copiado
Copiar
Copiado
from .norms import LayerNorm, RMSNorm, ScaleNorm
# Pipeline parallelism
# Pipeline parallelism
from megatron import mpu
from megatron import mpu
Copiar
Copiado
Copiar
Copiado
import torch.nn.functional as F
from
megatron.mpu
import
ParallelRelativePositionBias
import torch.nn.functional as F
from
apex.normalization.fused_layer_norm
import
FusedLayerNorm as LayerNorm
import megatron.fp16 as fp16
import megatron.fp16 as fp16
from megatron.model.transformer import ParallelTransformerLayerPipe
from megatron.model.transformer import ParallelTransformerLayerPipe
from .language_model import EmbeddingPipe
from .language_model import EmbeddingPipe
from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
def gpt2_attention_mask_func(attention_scores, ltor_mask):
def gpt2_attention_mask_func(attention_scores, ltor_mask):
attention_scores.masked_fill_(ltor_mask, -10000.0)
attention_scores.masked_fill_(ltor_mask, -10000.0)
return attention_scores
return attention_scores
Copiar
Copiado
Copiar
Copiado
def
C
ross
E
ntropy(output, labels
):
def
c
ross
_e
ntropy(output, labels
, _fp16=False
):
""" From pretrain_gpt2:forward_step() """
""" From pretrain_gpt2:forward_step() """
labels, loss_mask = labels[0], labels[1]
labels, loss_mask = labels[0], labels[1]
Copiar
Copiado
Copiar
Copiado
if _fp16:
losses = mpu.vocab_parallel_cross_entropy(output.contiguous()
.float
(), labels)
assert output.dtype == torch.half
losses = mpu.vocab_parallel_cross_entropy(output.contiguous()
, labels)
else:
output = fp16.fp16_to_fp32(output)
losses = mpu.vocab_parallel_cross_entropy(output.contiguous
(), labels)
loss_mask = loss_mask.view(-1)
loss_mask = loss_mask.view(-1)
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
return loss
return loss
Copiar
Copiado
Copiar
Copiado
class GPT2Model(MegatronModule):
class GPT2Model(MegatronModule):
"""GPT-2 Language model."""
"""GPT-2 Language model."""
def __init__(self, num_tokentypes=0, parallel_output=True):
def __init__(self, num_tokentypes=0, parallel_output=True):
super(GPT2Model, self).__init__()
super(GPT2Model, self).__init__()
args = get_args()
args = get_args()
Copiar
Copiado
Copiar
Copiado
self.weight_tying = not args.no_weight_tying
self.parallel_output = parallel_output
self.parallel_output = parallel_output
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.language_model, self._language_model_key = get_language_model(
self.language_model, self._language_model_key = get_language_model(
attention_mask_func=gpt2_attention_mask_func,
attention_mask_func=gpt2_attention_mask_func,
num_tokentypes=num_tokentypes,
num_tokentypes=num_tokentypes,
add_pooler=False,
add_pooler=False,
init_method=init_method_normal(args.init_method_std),
init_method=init_method_normal(args.init_method_std),
scaled_init_method=scaled_init_method_normal(args.init_method_std,
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers))
args.num_layers))
def forward(self, input_ids, position_ids, attention_mask, labels=None,
def forward(self, input_ids, position_ids, attention_mask, labels=None,
tokentype_ids=None, layer_past=None, get_key_value=False,
tokentype_ids=None, layer_past=None, get_key_value=False,
forward_method_parallel_output=None):
forward_method_parallel_output=None):
# Language model.
# Language model.
lm_output = self.language_model(input_ids,
lm_output = self.language_model(input_ids,
position_ids,
position_ids,
attention_mask,
attention_mask,
tokentype_ids=tokentype_ids,
tokentype_ids=tokentype_ids,
layer_past=layer_past,
layer_past=layer_past,
get_key_value=get_key_value)
get_key_value=get_key_value)
if get_key_value:
if get_key_value:
lm_output, presents = lm_output
lm_output, presents = lm_output
# Output.
# Output.
parallel_output = self.parallel_output
parallel_output = self.parallel_output
if forward_method_parallel_output is not None:
if forward_method_parallel_output is not None:
parallel_output = forward_method_parallel_output
parallel_output = forward_method_parallel_output
Copiar
Copiado
Copiar
Copiado
output = parallel_lm_logits(
if self.weight_tying:
lm_output,
output = parallel_lm_logits(
self.language_model.embedding.word_embeddings.weight,
lm_output,
parallel_output)
self.language_model.embedding.word_embeddings.weight,
parallel_output)
else:
output = parallel_lm_logits(
lm_output,
None,
parallel_output, weight_tying=False)
if get_key_value:
if get_key_value:
output = [output, presents]
output = [output, presents]
if labels is None:
if labels is None:
return output
return output
else:
else:
if self.fp16_lm_cross_entropy:
if self.fp16_lm_cross_entropy:
assert output.dtype == torch.half
assert output.dtype == torch.half
loss = mpu.vocab_parallel_cross_entropy(output, labels)
loss = mpu.vocab_parallel_cross_entropy(output, labels)
else:
else:
loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
return loss
return loss
Copiar
Copiado
Copiar
Copiado
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
keep_vars=False):
state_dict_ = {}
state_dict_ = {}
state_dict_[self._language_model_key] \
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
= self.language_model.state_dict_for_save_checkpoint(
Copiar
Copiado
Copiar
Copiado
destination, prefix, keep_vars)
destination, prefix, keep_vars)
return state_dict_
return state_dict_
def load_state_dict(self, state_dict, strict=True):
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
"""Customized load."""
if self._language_model_key in state_dict:
if self._language_model_key in state_dict:
state_dict = state_dict[self._language_model_key]
state_dict = state_dict[self._language_model_key]
self.language_model.load_state_dict(state_dict, strict=strict)
self.language_model.load_state_dict(state_dict, strict=strict)
Copiar
Copiado
Copiar
Copiado
class GPT2ModelPipe(PipelineModule,
MegatronModule):
class GPT2ModelPipe(PipelineModule,
MegatronModule):
"""GPT2Model adapted for pipeline parallelism.
"""GPT2Model adapted for pipeline parallelism.
The largest change is flattening the GPTModel class so we can express it as a
The largest change is flattening the GPTModel class so we can express it as a
sequence of layers including embedding, transformer layers, and output.
sequence of layers including embedding, transformer layers, and output.
"""
"""
Copiar
Copiado
Copiar
Copiado
def __init__(self, num_tokentypes=0, parallel_output=True,
add_pooler=False,
topology=None):
def __init__(self, num_tokentypes=0, parallel_output=True,
topology=None):
args = get_args()
args = get_args()
self.parallel_output = parallel_output
self.parallel_output = parallel_output
self.hidden_size = args.hidden_size
self.hidden_size = args.hidden_size
self.num_tokentypes = num_tokentypes
self.num_tokentypes = num_tokentypes
self.init_method = init_method_normal(args.init_method_std)
self.init_method = init_method_normal(args.init_method_std)
self.output_layer_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers)
self.output_layer_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers)
Copiar
Copiado
Copiar
Copiado
self.add_pooler = add_pooler
weight_tying = not args.no_weight_tying
if self.add_pooler:
if args.pos_emb == 'rpe':
raise NotImplementedError('Pipeline pooler not yet implemented. Forward needs pooling_sequence_index')
rpe_emb = ParallelRelativePositionBias(causal=True, num_buckets=args.rpe_num_buckets, max_distance=args.rpe_max_distance,
heads=args.num_attention_heads)
else:
rpe_emb = None
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
Copiar
Copiado
Copiar
Copiado
# Use torch gelu unless otherwise forced.
gelu = F.gelu
if args.openai_gelu:
gelu = openai_gelu
#
#
# forward() prototype
# forward() prototype
#
#
self.specs = []
self.specs = []
Copiar
Copiado
Copiar
Copiado
# Embedding layer
# Embedding layer
Copiar
Copiado
Copiar
Copiado
self.specs.append(TiedLayerSpec('embed',
if weight_tying:
EmbeddingPipe,
self.specs.append(TiedLayerSpec('embed',
EmbeddingPipe,
self.hidden_size,
args.padded_vocab_size,
args.max_position_embeddings,
args.hidden_dropout,
self.init_method,
self.num_tokentypes,
tied_weight_attr='word_embeddings_weight'))
else:
self.specs.append(LayerSpec(
EmbeddingPipe,
self.hidden_size,
self.hidden_size,
args.padded_vocab_size,
args.padded_vocab_size,
args.max_position_embeddings,
args.max_position_embeddings,
args.hidden_dropout,
args.hidden_dropout,
self.init_method,
self.init_method,
Copiar
Copiado
Copiar
Copiado
self.num_tokentypes
,
self.num_tokentypes
))
tied_weight_attr='word_embeddings_weight'
))
# outputs are now (hidden_states, attention_mask)
# outputs are now (hidden_states, attention_mask)
# data format change to avoid explicit tranposes : [b s h] --> [s b h]
# data format change to avoid explicit tranposes : [b s h] --> [s b h]
Copiar
Copiado
Copiar
Copiado
self.specs.append(lambda x: (x[0].transpose(0,
1).contiguous(), x[1]))
self.specs.append(lambda x: (x[0].transpose(0,
1).contiguous(), x[1]))
# Transformer layers
# Transformer layers
for x in range(args.num_layers):
for x in range(args.num_layers):
Copiar
Copiado
Copiar
Copiado
if args.sparsity == 'none':
sparse = False
elif args.sparsity == 'all':
sparse = True
elif args.sparsity == 'interspersed':
sparse = not x % 2 == 0
self.specs.append(
self.specs.append(
LayerSpec(ParallelTransformerLayerPipe,
LayerSpec(ParallelTransformerLayerPipe,
attention_mask_func=gpt2_attention_mask_func,
attention_mask_func=gpt2_attention_mask_func,
init_method=self.init_method,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
output_layer_init_method=self.output_layer_init_method,
Copiar
Copiado
Copiar
Copiado
layer_number=x
))
layer_number=x
,
sparse=sparse,
rpe=rpe_emb
))
# Undo data format change and drop mask
# Undo data format change and drop mask
Copiar
Copiado
Copiar
Copiado
self.specs.append(lambda x: x[0].transpose(0,
1).contiguous())
self.specs.append(lambda x: x[0].transpose(0,
1).contiguous())
# Final layernorm after transformer layers
# Final layernorm after transformer layers
Copiar
Copiado
Copiar
Copiado
if args.norm == "rmsnorm":
norm = RMSNorm
eps = args.rms_norm_epsilon
elif args.norm == "layernorm":
eps = args.layernorm_epsilon
norm = LayerNorm
elif args.norm == "scalenorm":
eps = args.scalenorm_epsilon
norm = ScaleNorm
self.specs.append(
self.specs.append(
Copiar
Copiado
Copiar
Copiado
LayerSpec(
LayerNorm
,
LayerSpec(
norm
,
args.hidden_size,
args.hidden_size,
Copiar
Copiado
Copiar
Copiado
eps=
args.layernorm_epsilon
))
eps=
eps
))
# XXX forward_method_parallel_output is assumed to be None, but we're not in a
# XXX forward_method_parallel_output is assumed to be None, but we're not in a
# fwd method to assert
# fwd method to assert
def _logits_helper(embedding, lm_output):
def _logits_helper(embedding, lm_output):
"""Just a wrapper to massage inputs/outputs from pipeline. """
"""Just a wrapper to massage inputs/outputs from pipeline. """
return parallel_lm_logits(
return parallel_lm_logits(
lm_output,
lm_output,
embedding.word_embeddings_weight,
embedding.word_embeddings_weight,
self.parallel_output)
self.parallel_output)
Copiar
Copiado
Copiar
Copiado
self.specs.append(
if weight_tying:
TiedLayerSpec('embed',
self.specs.append(
EmbeddingPipe,
TiedLayerSpec('embed',
self.hidden_size,
EmbeddingPipe,
args.padded_vocab_size,
self.hidden_size,
args.max_position_embeddings,
args.padded_vocab_size,
args.hidden_dropout,
args.max_position_embeddings,
self.init_method,
args.hidden_dropout,
self.num_tokentypes,
self.init_method,
forward_fn=_logits_helper,
self.num_tokentypes,
tied_weight_attr='word_embeddings_weight')
forward_fn=_logits_helper,
)
tied_weight_attr='word_embeddings_weight')
)
# Should maybe be done in loss_fn() instead?
else:
if args.fp16:
self.specs.append(
self.specs.append(fp16.fp16_to_fp32)
LayerSpec(
mpu.RowParallelLinear,
args.hidden_size,
args.padded_vocab_size,
bias=False,
input_is_parallel=False,
parallel_output=True,
skip_bias_add=False
)
)
self.specs.append(lambda x: x[0]) # drop bias
Copiar
Copiado
Copiar
Copiado
loss_fn = partial(cross_entropy, _fp16=self.fp16_lm_cross_entropy)
if args.checkpoint_activations:
if args.checkpoint_activations:
interval = args.checkpoint_num_layers
interval = args.checkpoint_num_layers
else:
else:
interval = 0
interval = 0
super().__init__(layers=self.specs,
super().__init__(layers=self.specs,
Copiar
Copiado
Copiar
Copiado
loss_fn=
CrossEntropy
,
loss_fn=
loss_fn
,
topology=topology,
topology=topology,
activation_checkpoint_interval=interval,
activation_checkpoint_interval=interval,
partition_method='type:transformer')
partition_method='type:transformer')
Diferenças salvas
Texto original
Abrir arquivo
# coding=utf-8 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """GPT-2 model.""" import torch from megatron import get_args from megatron import mpu from megatron.module import MegatronModule from .language_model import parallel_lm_logits from .language_model import get_language_model from .utils import init_method_normal from .utils import scaled_init_method_normal # Pipeline parallelism from megatron import mpu import torch.nn.functional as F import torch.nn.functional as F from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm import megatron.fp16 as fp16 from megatron.model.transformer import ParallelTransformerLayerPipe from .language_model import EmbeddingPipe from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec def gpt2_attention_mask_func(attention_scores, ltor_mask): attention_scores.masked_fill_(ltor_mask, -10000.0) return attention_scores def CrossEntropy(output, labels): """ From pretrain_gpt2:forward_step() """ labels, loss_mask = labels[0], labels[1] losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(), labels) loss_mask = loss_mask.view(-1) loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() return loss class GPT2Model(MegatronModule): """GPT-2 Language model.""" def __init__(self, num_tokentypes=0, parallel_output=True): super(GPT2Model, self).__init__() args = get_args() self.parallel_output = parallel_output self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy self.language_model, self._language_model_key = get_language_model( attention_mask_func=gpt2_attention_mask_func, num_tokentypes=num_tokentypes, add_pooler=False, init_method=init_method_normal(args.init_method_std), scaled_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers)) def forward(self, input_ids, position_ids, attention_mask, labels=None, tokentype_ids=None, layer_past=None, get_key_value=False, forward_method_parallel_output=None): # Language model. lm_output = self.language_model(input_ids, position_ids, attention_mask, tokentype_ids=tokentype_ids, layer_past=layer_past, get_key_value=get_key_value) if get_key_value: lm_output, presents = lm_output # Output. parallel_output = self.parallel_output if forward_method_parallel_output is not None: parallel_output = forward_method_parallel_output output = parallel_lm_logits( lm_output, self.language_model.embedding.word_embeddings.weight, parallel_output) if get_key_value: output = [output, presents] if labels is None: return output else: if self.fp16_lm_cross_entropy: assert output.dtype == torch.half loss = mpu.vocab_parallel_cross_entropy(output, labels) else: loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) return loss def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False): state_dict_ = {} state_dict_[self._language_model_key] \ = self.language_model.state_dict_for_save_checkpoint( destination, prefix, keep_vars) return state_dict_ def load_state_dict(self, state_dict, strict=True): """Customized load.""" if self._language_model_key in state_dict: state_dict = state_dict[self._language_model_key] self.language_model.load_state_dict(state_dict, strict=strict) class GPT2ModelPipe(PipelineModule,MegatronModule): """GPT2Model adapted for pipeline parallelism. The largest change is flattening the GPTModel class so we can express it as a sequence of layers including embedding, transformer layers, and output. """ def __init__(self, num_tokentypes=0, parallel_output=True, add_pooler=False, topology=None): args = get_args() self.parallel_output = parallel_output self.hidden_size = args.hidden_size self.num_tokentypes = num_tokentypes self.init_method = init_method_normal(args.init_method_std) self.output_layer_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers) self.add_pooler = add_pooler if self.add_pooler: raise NotImplementedError('Pipeline pooler not yet implemented. Forward needs pooling_sequence_index') # Use torch gelu unless otherwise forced. gelu = F.gelu if args.openai_gelu: gelu = openai_gelu # # forward() prototype # self.specs = [] # Embedding layer self.specs.append(TiedLayerSpec('embed', EmbeddingPipe, self.hidden_size, args.padded_vocab_size, args.max_position_embeddings, args.hidden_dropout, self.init_method, self.num_tokentypes, tied_weight_attr='word_embeddings_weight')) # outputs are now (hidden_states, attention_mask) # data format change to avoid explicit tranposes : [b s h] --> [s b h] self.specs.append(lambda x: (x[0].transpose(0,1).contiguous(), x[1])) # Transformer layers for x in range(args.num_layers): self.specs.append( LayerSpec(ParallelTransformerLayerPipe, attention_mask_func=gpt2_attention_mask_func, init_method=self.init_method, output_layer_init_method=self.output_layer_init_method, layer_number=x)) # Undo data format change and drop mask self.specs.append(lambda x: x[0].transpose(0,1).contiguous()) # Final layernorm after transformer layers self.specs.append( LayerSpec(LayerNorm, args.hidden_size, eps=args.layernorm_epsilon)) # XXX forward_method_parallel_output is assumed to be None, but we're not in a # fwd method to assert def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline. """ return parallel_lm_logits( lm_output, embedding.word_embeddings_weight, self.parallel_output) self.specs.append( TiedLayerSpec('embed', EmbeddingPipe, self.hidden_size, args.padded_vocab_size, args.max_position_embeddings, args.hidden_dropout, self.init_method, self.num_tokentypes, forward_fn=_logits_helper, tied_weight_attr='word_embeddings_weight') ) # Should maybe be done in loss_fn() instead? if args.fp16: self.specs.append(fp16.fp16_to_fp32) if args.checkpoint_activations: interval = args.checkpoint_num_layers else: interval = 0 super().__init__(layers=self.specs, loss_fn=CrossEntropy, topology=topology, activation_checkpoint_interval=interval, partition_method='type:transformer')
Texto alterado
Abrir arquivo
# coding=utf-8 # # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. # # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """GPT-2 model.""" import torch from megatron import get_args from megatron.module import MegatronModule from functools import partial from .language_model import parallel_lm_logits from .language_model import get_language_model from .utils import init_method_normal from .utils import scaled_init_method_normal from .norms import LayerNorm, RMSNorm, ScaleNorm # Pipeline parallelism from megatron import mpu from megatron.mpu import ParallelRelativePositionBias import megatron.fp16 as fp16 from megatron.model.transformer import ParallelTransformerLayerPipe from .language_model import EmbeddingPipe from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec def gpt2_attention_mask_func(attention_scores, ltor_mask): attention_scores.masked_fill_(ltor_mask, -10000.0) return attention_scores def cross_entropy(output, labels, _fp16=False): """ From pretrain_gpt2:forward_step() """ labels, loss_mask = labels[0], labels[1] if _fp16: assert output.dtype == torch.half losses = mpu.vocab_parallel_cross_entropy(output.contiguous(), labels) else: output = fp16.fp16_to_fp32(output) losses = mpu.vocab_parallel_cross_entropy(output.contiguous(), labels) loss_mask = loss_mask.view(-1) loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() return loss class GPT2Model(MegatronModule): """GPT-2 Language model.""" def __init__(self, num_tokentypes=0, parallel_output=True): super(GPT2Model, self).__init__() args = get_args() self.weight_tying = not args.no_weight_tying self.parallel_output = parallel_output self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy self.language_model, self._language_model_key = get_language_model( attention_mask_func=gpt2_attention_mask_func, num_tokentypes=num_tokentypes, add_pooler=False, init_method=init_method_normal(args.init_method_std), scaled_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers)) def forward(self, input_ids, position_ids, attention_mask, labels=None, tokentype_ids=None, layer_past=None, get_key_value=False, forward_method_parallel_output=None): # Language model. lm_output = self.language_model(input_ids, position_ids, attention_mask, tokentype_ids=tokentype_ids, layer_past=layer_past, get_key_value=get_key_value) if get_key_value: lm_output, presents = lm_output # Output. parallel_output = self.parallel_output if forward_method_parallel_output is not None: parallel_output = forward_method_parallel_output if self.weight_tying: output = parallel_lm_logits( lm_output, self.language_model.embedding.word_embeddings.weight, parallel_output) else: output = parallel_lm_logits( lm_output, None, parallel_output, weight_tying=False) if get_key_value: output = [output, presents] if labels is None: return output else: if self.fp16_lm_cross_entropy: assert output.dtype == torch.half loss = mpu.vocab_parallel_cross_entropy(output, labels) else: loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) return loss def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False): state_dict_ = {} state_dict_[self._language_model_key] \ = self.language_model.state_dict_for_save_checkpoint( destination, prefix, keep_vars) return state_dict_ def load_state_dict(self, state_dict, strict=True): """Customized load.""" if self._language_model_key in state_dict: state_dict = state_dict[self._language_model_key] self.language_model.load_state_dict(state_dict, strict=strict) class GPT2ModelPipe(PipelineModule, MegatronModule): """GPT2Model adapted for pipeline parallelism. The largest change is flattening the GPTModel class so we can express it as a sequence of layers including embedding, transformer layers, and output. """ def __init__(self, num_tokentypes=0, parallel_output=True, topology=None): args = get_args() self.parallel_output = parallel_output self.hidden_size = args.hidden_size self.num_tokentypes = num_tokentypes self.init_method = init_method_normal(args.init_method_std) self.output_layer_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers) weight_tying = not args.no_weight_tying if args.pos_emb == 'rpe': rpe_emb = ParallelRelativePositionBias(causal=True, num_buckets=args.rpe_num_buckets, max_distance=args.rpe_max_distance, heads=args.num_attention_heads) else: rpe_emb = None self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy # # forward() prototype # self.specs = [] # Embedding layer if weight_tying: self.specs.append(TiedLayerSpec('embed', EmbeddingPipe, self.hidden_size, args.padded_vocab_size, args.max_position_embeddings, args.hidden_dropout, self.init_method, self.num_tokentypes, tied_weight_attr='word_embeddings_weight')) else: self.specs.append(LayerSpec(EmbeddingPipe, self.hidden_size, args.padded_vocab_size, args.max_position_embeddings, args.hidden_dropout, self.init_method, self.num_tokentypes)) # outputs are now (hidden_states, attention_mask) # data format change to avoid explicit tranposes : [b s h] --> [s b h] self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), x[1])) # Transformer layers for x in range(args.num_layers): if args.sparsity == 'none': sparse = False elif args.sparsity == 'all': sparse = True elif args.sparsity == 'interspersed': sparse = not x % 2 == 0 self.specs.append( LayerSpec(ParallelTransformerLayerPipe, attention_mask_func=gpt2_attention_mask_func, init_method=self.init_method, output_layer_init_method=self.output_layer_init_method, layer_number=x, sparse=sparse, rpe=rpe_emb)) # Undo data format change and drop mask self.specs.append(lambda x: x[0].transpose(0, 1).contiguous()) # Final layernorm after transformer layers if args.norm == "rmsnorm": norm = RMSNorm eps = args.rms_norm_epsilon elif args.norm == "layernorm": eps = args.layernorm_epsilon norm = LayerNorm elif args.norm == "scalenorm": eps = args.scalenorm_epsilon norm = ScaleNorm self.specs.append( LayerSpec(norm, args.hidden_size, eps=eps)) # XXX forward_method_parallel_output is assumed to be None, but we're not in a # fwd method to assert def _logits_helper(embedding, lm_output): """Just a wrapper to massage inputs/outputs from pipeline. """ return parallel_lm_logits( lm_output, embedding.word_embeddings_weight, self.parallel_output) if weight_tying: self.specs.append( TiedLayerSpec('embed', EmbeddingPipe, self.hidden_size, args.padded_vocab_size, args.max_position_embeddings, args.hidden_dropout, self.init_method, self.num_tokentypes, forward_fn=_logits_helper, tied_weight_attr='word_embeddings_weight') ) else: self.specs.append( LayerSpec( mpu.RowParallelLinear, args.hidden_size, args.padded_vocab_size, bias=False, input_is_parallel=False, parallel_output=True, skip_bias_add=False ) ) self.specs.append(lambda x: x[0]) # drop bias loss_fn = partial(cross_entropy, _fp16=self.fp16_lm_cross_entropy) if args.checkpoint_activations: interval = args.checkpoint_num_layers else: interval = 0 super().__init__(layers=self.specs, loss_fn=loss_fn, topology=topology, activation_checkpoint_interval=interval, partition_method='type:transformer')
Encontrar Diferença