Notes
Notes - notes.io |
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (Vaswani et al., 2017).

    Precomputes a (1, max_seq_length, d_model) table of sin/cos encodings
    and adds the first `seq_len` rows to the input on each forward pass.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_seq_length = max_seq_length
        # register_buffer (not a plain attribute) so the table moves with
        # .to(device)/.cuda() and is saved in state_dict without being a
        # trainable parameter.
        self.register_buffer("positional_encodings", self.generate_encodings())

    def generate_encodings(self):
        """Build the (1, max_seq_length, d_model) encoding table."""
        pe = torch.zeros(self.max_seq_length, self.d_model)
        position = torch.arange(0, self.max_seq_length, dtype=torch.float).unsqueeze(1)
        # Geometric frequency schedule: 10000^(-2i/d_model) for each even index 2i.
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dims: sine
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dims: cosine
        return pe.unsqueeze(0)  # leading batch dim for broadcasting

    def forward(self, x):
        """Add positional encodings to x of shape (batch, seq_len, d_model)."""
        return x + self.positional_encodings[:, :x.size(1)]
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Projects query/key/value to d_model, splits into num_heads heads of
    head_dim = d_model // num_heads, attends per head, then re-merges and
    applies a final linear projection.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Otherwise head_dim * num_heads != d_model and the reshape below
            # silently misaligns the projections.
            raise ValueError("d_model must be divisible by num_heads")
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.final_linear = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        """Attend over (batch, seq, d_model) tensors.

        mask: optional broadcastable tensor; positions where mask == 0 are
        excluded from attention.
        """
        batch_size = query.size(0)
        query = self.q_linear(query)
        key = self.k_linear(key)
        value = self.v_linear(value)
        # (batch, seq, d_model) -> (batch, heads, seq, head_dim)
        query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        # Scaled dot-product: divide by sqrt(head_dim) so logits don't grow
        # with dimension and saturate the softmax (the "scaled" in the paper).
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            # Large negative (not -inf) keeps softmax numerically safe when a
            # whole row is masked.
            scores = scores.masked_fill(mask == 0, float("-1e20"))
        attention = F.softmax(scores, dim=-1)
        weighted_sum = torch.matmul(attention, value)
        # Re-merge heads: (batch, heads, seq, head_dim) -> (batch, seq, d_model)
        weighted_sum = weighted_sum.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.final_linear(weighted_sum)
        return output
class PositionwiseFeedForward(nn.Module):
    """Two-layer position-wise feed-forward network with ReLU and dropout.

    Applied independently at every sequence position:
    d_model -> hidden_dim -> ReLU -> dropout -> d_model.
    """

    def __init__(self, d_model, hidden_dim, drop_prob):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, d_model)
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x):
        """Transform (..., d_model) -> (..., d_model)."""
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)
class TransformerLayer(nn.Module):
    """One post-norm transformer encoder layer.

    Sub-layer 1: self-attention, then residual add, dropout, LayerNorm.
    Sub-layer 2: position-wise feed-forward, then residual add, dropout,
    LayerNorm.
    """

    def __init__(self, d_model, num_heads, ffn_hidden_dim, drop_prob):
        super(TransformerLayer, self).__init__()
        self.multihead_attention = MultiHeadAttention(d_model, num_heads)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        self.feedforward = PositionwiseFeedForward(d_model, ffn_hidden_dim, drop_prob)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, mask):
        """Run both sub-layers on x of shape (batch, seq, d_model)."""
        # Self-attention sub-layer: query = key = value = x.
        attended = self.multihead_attention(x, x, x, mask)
        x = self.layer_norm1(x + self.dropout1(attended))
        # Feed-forward sub-layer.
        transformed = self.feedforward(x)
        x = self.layer_norm2(x + self.dropout2(transformed))
        return x
class Transformer(nn.Module):
    """Encoder-only transformer: embed tokens, add positional encodings,
    run num_layers encoder layers, project to output-vocabulary logits.

    NOTE(review): `target` is accepted for interface compatibility but is
    never used — this model has no decoder stack; confirm against callers.
    """

    def __init__(self, num_layers, d_model, num_heads, ffn_hidden_dim, input_vocab_size,
                 output_vocab_size, max_seq_length, drop_prob):
        super(Transformer, self).__init__()
        self.num_layers = num_layers
        self.encoder = nn.Embedding(input_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.transformer_layers = nn.ModuleList([
            TransformerLayer(d_model, num_heads, ffn_hidden_dim, drop_prob) for _ in range(num_layers)
        ])
        self.decoder = nn.Linear(d_model, output_vocab_size)

    def forward(self, source, target, source_mask):
        """Map source token ids (batch, seq) to logits (batch, seq, output_vocab_size).

        Bug fix: the positional encoding must be applied to the *embedded*
        sequence, not to the raw integer token ids (the original
        `encoder(source) + positional_encoding(source)` both double-counted
        the input and broke on shape/dtype).
        """
        x = self.positional_encoding(self.encoder(source))
        for layer in self.transformer_layers:
            x = layer(x, source_mask)
        logits = self.decoder(x)
        return logits
# Instantiate the Transformer model
num_layers = 6
d_model = 512
num_heads = 8
ffn_hidden_dim = 2048
# NOTE(review): input_vocab / output_vocab are not defined in this snippet —
# they must be built earlier (e.g. from the training corpus); confirm.
input_vocab_size = len(input_vocab)
output_vocab_size = len(output_vocab)
max_seq_length = 300
drop_prob = 0.1
transformer = Transformer(num_layers, d_model, num_heads, ffn_hidden_dim, input_vocab_size,
                          output_vocab_size, max_seq_length, drop_prob)
# Move the model to GPU if available (original line was truncated at "transform").
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer = transformer.to(device)
|
Notes.io is a web-based application for taking notes. You can take your notes and share them with other people. If you like taking long notes, notes.io is designed for you. To date, over 8,000,000,000 notes have been created, and counting...
With notes.io;
- * You can take a note from anywhere and any device with internet connection.
- * You can share the notes in social platforms (YouTube, Facebook, Twitter, instagram etc.).
- * You can quickly share your contents without website, blog and e-mail.
- * You don't need to create any Account to share a note. As you wish you can use quick, easy and best shortened notes with sms, websites, e-mail, or messaging services (WhatsApp, iMessage, Telegram, Signal).
- * Notes.io has fabulous infrastructure design for a short link and allows you to share the note as an easy and understandable link.
Fast: Notes.io is built for speed and performance. You can take a note quickly and browse your archive.
Easy: Notes.io doesn’t require installation. Just write and share note!
Short: Notes.io’s URL is just 8 characters. You’ll get a shortened link for your note when you want to share it. (Ex: notes.io/q )
Free: Notes.io works for 12 years and has been free since the day it was started.
You immediately create your first note and start sharing with the ones you wish. If you want to contact us, you can use the following communication channels;
Email: [email protected]
Twitter: http://twitter.com/notesio
Instagram: http://instagram.com/notes.io
Facebook: http://facebook.com/notesio
Regards;
Notes.io Team