Introduction to the Transformer Model


import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    # Note: the opening of this class was cut off in the source. The __init__
    # and the start of forward() below are reconstructed from how the rest of
    # the code uses self.d_k, self.n_heads, self.W_O, split_heads() and
    # combine_heads().
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        # Learned projections for queries, keys, values, and the output.
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        # Project the inputs, then split them into n_heads attention heads.
        Q = self.W_Q(Q)
        K = self.W_K(K)
        V = self.W_V(V)
        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)
        # Scaled dot-product attention.
        scores = torch.matmul(Q, K.transpose(-1, -2)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = F.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, V)
        attn_output = self.combine_heads(attn_output)
        attn_output = self.W_O(attn_output)
        return attn_output

    def split_heads(self, x):
        # (batch, seq_len, d_model) -> (batch, n_heads, seq_len, d_k)
        batch_size, seq_len, d_model = x.size()
        x = x.view(batch_size, seq_len, self.n_heads, self.d_k)
        return x.transpose(1, 2)

    def combine_heads(self, x):
        # (batch, n_heads, seq_len, d_v) -> (batch, seq_len, n_heads * d_v)
        batch_size, n_heads, seq_len, d_v = x.size()
        x = x.transpose(1, 2).contiguous()
        x = x.view(batch_size, seq_len, n_heads * d_v)
        return x


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=0.1)
        # Sinusoidal positional encodings, precomputed for max_len positions:
        # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:, :x.size(1)]
        x = self.dropout(x)
        return x


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x


class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super(EncoderLayer, self).__init__()
        self.multi_head_attn = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)

    def forward(self, x, mask=None):
        # Self-attention sub-layer with residual connection and LayerNorm.
        attn_output = self.multi_head_attn(x, x, x, mask=mask)
        x = x + self.dropout1(attn_output)
        x = self.layer_norm1(x)
        # Position-wise feed-forward sub-layer, also residual + LayerNorm.
        ff_output = self.feed_forward(x)
        x = x + self.dropout2(ff_output)
        x = self.layer_norm2(x)
        return x


class Encoder(nn.Module):
    def __init__(self, input_size, d_model, n_heads, d_ff, n_layers):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, mask=mask)
        x = self.layer_norm(x)
        return x


class Transformer(nn.Module):
    def __init__(self, input_size, output_size, d_model, n_heads, d_ff, n_layers):
        super(Transformer, self).__init__()
        self.encoder = Encoder(input_size, d_model, n_heads, d_ff, n_layers)
        self.output_layer = nn.Linear(d_model, output_size)

    def forward(self, x, mask=None):
        x = self.encoder(x, mask)
        # Use the first token's representation as the sequence summary.
        x = x[:, 0, :]
        x = self.output_layer(x)
        return x
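
As a quick sanity check, here is a minimal usage sketch of the model defined above. The hyperparameter values (vocabulary size, number of output classes, d_model, and the padding-mask shape) are illustrative assumptions, not values taken from the article.

# Minimal usage sketch; the hyperparameters below are illustrative assumptions.
model = Transformer(input_size=10000, output_size=2,
                    d_model=512, n_heads=8, d_ff=2048, n_layers=6)

tokens = torch.randint(0, 10000, (4, 32))   # batch of 4 sequences, 32 token ids each
mask = torch.ones(4, 1, 1, 32)              # broadcasts over heads; 1 = attend, 0 = masked
logits = model(tokens, mask=mask)           # shape: (4, 2)
print(logits.shape)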
