# Lab 9: Implement a skip-gram model to predict the words within a certain range before and after the current word.

from nltk.corpus import gutenberg # to get bible corpus
from string import punctuation # to remove punctuation from corpus
import nltk
import re
import numpy as np
from keras.preprocessing import text
from keras.preprocessing.sequence import skipgrams
from keras.layers import *
from keras.layers.core import Dense, Reshape
from keras.layers import Embedding
from keras.models import Model, Sequential

# NOTE(review): in the original, the import above was fused with three
# dangling string arguments ('gutenberg', 'punkt', 'stopwords') whose call
# prefix had been lost. They are restored here as nltk.download() calls,
# which fetch the corpora used below -- confirm against the source notebook.
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

# English stop words dropped by normalize_document().
stop_words = nltk.corpus.stopwords.words('english')
# King James Bible, as sentences that are themselves lists of word tokens.
bible = gutenberg.sents("bible-kjv.txt")
# Tokens matched against this string are discarded: punctuation and digits.
remove_terms = punctuation + '0123456789'
# Tokenizer used by normalize_document().
wpt = nltk.WordPunctTokenizer()
def normalize_document(doc):
    """Normalize one document before vocabulary building.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases and trims the text, tokenizes it with the module-level
    ``wpt`` tokenizer and drops the tokens listed in ``stop_words``.

    Args:
        doc: Raw document text.

    Returns:
        The remaining tokens joined by single spaces (may be empty).
    """
    # Two fixes here:
    #  * the flags are passed by keyword -- the fourth positional argument of
    #    re.sub() is ``count``, so the old call capped the number of
    #    substitutions instead of setting flags;
    #  * the character class uses \s (the backslash had been lost), otherwise
    #    whitespace is deleted and all words run together.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# Element-wise version of normalize_document() for a sequence of documents.
normalize_corpus = np.vectorize(normalize_document)

# Lower-case every token and drop the ones matched by ``remove_terms``.
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
# filter(None, ...) discards sentences that normalized to the empty string.
norm_bible = filter(None, normalize_corpus(norm_bible))
# Keep only sentences with more than two tokens.
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

# The vocabulary must exist before skip-grams are generated; the original
# statement order used vocab_size, wids and id2word before defining them.
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
# +1 leaves room for id 0, which the Keras Tokenizer never assigns to a word.
vocab_size = len(word2id) + 1
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:5])

# generate skip-grams: one (pairs, labels) tuple per sentence
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
# Never index past the pairs actually produced for the first sentence.
for i in range(min(10, len(pairs))):
    # NOTE(review): the original call was cut off after the fourth argument;
    # ``labels[i]`` is restored to fill the final ``{:d}`` placeholder.
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))
# build skip-gram architecture
embed_size = 100

# NOTE(review): both Embedding(...) calls were cut off after ``embed_size,``
# in the original. The closing arguments (glorot_uniform initializer,
# input_length=1) and the context-side Reshape are reconstructed so that the
# two branches produce tensors of the same shape -- confirm against the
# source notebook.

# Target-word branch: one word id -> one embed_size vector.
word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size, )))

# Context-word branch, same structure with its own embedding table.
context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                            embeddings_initializer="glorot_uniform",
                            input_length=1))
context_model.add(Reshape((embed_size, )))

# Combine the two embeddings element-wise, then score the pair with a single
# sigmoid unit.
merged_output = add([word_model.output, context_model.output])
model_combined = Sequential()
model_combined.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
final_model = Model([word_model.input, context_model.input], model_combined(merged_output))
final_model.compile(loss="mean_squared_error", optimizer="rmsprop")
# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Convert the model to a Graphviz graph laid out top-to-bottom, render it with
# the ``dot`` program as SVG and wrap the result for notebook display.
SVG(
    model_to_dot(
        final_model,
        show_shapes=True,
        show_layer_names=False,
        rankdir='TB',
    ).create(prog='dot', format='svg')
)
# Train for two epochs; each sentence's skip-gram set is one batch.
for epoch in range(1, 3):
    loss = 0
    for i, elem in enumerate(skip_grams):
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        # A sentence that produced no pairs cannot be unzipped or trained on.
        if not elem[0]:
            continue
        # Unzip the (word, context) pairs once instead of once per column.
        first_ids, second_ids = zip(*elem[0])
        pair_first_elem = np.array(first_ids, dtype='int32')
        pair_second_elem = np.array(second_ids, dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        loss += final_model.train_on_batch(X, Y)

    print('Epoch:', epoch, 'Loss:', loss)
from sklearn.metrics.pairwise import euclidean_distances

# Embedding matrix of the target-word branch. Row 0 is sliced off, so row r of
# ``weights`` belongs to word id r + 1.
word_embed_layer = word_model.layers[0]
weights = word_embed_layer.get_weights()[0][1:]

# Pairwise Euclidean distances between all word embeddings.
distance_matrix = euclidean_distances(weights)

# For each query term take the entries at sorted positions 1..5 of its
# distance row (position 0 is skipped -- normally the term itself at distance
# zero). The -1 / +1 convert between word ids and row indices.
similar_words = {
    term: [
        id2word[neighbor_id]
        for neighbor_id in distance_matrix[word2id[term]-1].argsort()[1:6]+1
    ]
    for term in ['god', 'jesus','egypt', 'john', 'famine']
}


# Lab 11: Build an RNN to perform character-level language modeling.

import numpy as np
import matplotlib.pyplot as plt

def initialize_parameters(vocab_size, hidden_layer_size):
    """Create the initial weights and biases of the character-level RNN.

    Weight matrices are drawn from a standard normal distribution scaled by
    0.01; both bias vectors start at zero.

    Args:
        vocab_size: Number of distinct characters (input and output size).
        hidden_layer_size: Number of hidden units.

    Returns:
        dict with keys "Whh" (hidden, hidden), "Wxh" (hidden, vocab),
        "b" (hidden, 1), "Why" (vocab, hidden) and "c" (vocab, 1).
    """
    # Entries are evaluated in this order, so the random draws happen in the
    # same sequence as in the statement-by-statement version.
    return {
        "Whh": np.random.randn(hidden_layer_size, hidden_layer_size) * 0.01,
        "Wxh": np.random.randn(hidden_layer_size, vocab_size) * 0.01,
        "b": np.zeros((hidden_layer_size, 1)),
        "Why": np.random.randn(vocab_size, hidden_layer_size) * 0.01,
        "c": np.zeros((vocab_size, 1)),
    }

def initialize_adam(parameters):
    """Create zeroed Adam moment accumulators for every RNN parameter.

    Args:
        parameters: dict holding the arrays "Whh", "Wxh", "b", "Why", "c".

    Returns:
        Tuple ``(v, s)`` of dicts keyed "dWhh", "dWxh", "db", "dWhy", "dc".
        Each value is a zero array shaped like the matching parameter; ``v``
        and ``s`` hold separate arrays.
    """
    parameters_names = ["Whh", "Wxh", "b", "Why", "c"]
    v = {"d" + name: np.zeros_like(parameters[name]) for name in parameters_names}
    s = {"d" + name: np.zeros_like(parameters[name]) for name in parameters_names}
    return v, s

def initialize_rmsprop(parameters):
    """Create the zeroed RMSProp squared-gradient accumulators.

    Args:
        parameters: dict holding the arrays "Whh", "Wxh", "b", "Why", "c".

    Returns:
        dict keyed "dWhh", "dWxh", "db", "dWhy", "dc"; each value is a zero
        array shaped like the matching parameter.
    """
    parameters_names = ["Whh", "Wxh", "b", "Why", "c"]
    return {"d" + name: np.zeros_like(parameters[name]) for name in parameters_names}

def softmax(z):
    """Return the softmax of ``z``, normalized over all of its elements.

    Args:
        z: Array of logits (a column vector at every call site in this file).

    Returns:
        Array of the same shape as ``z`` whose entries are positive and sum
        to 1.
    """
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp() from overflowing to inf (and the ratio to nan) on large
    # logits.
    e_z = np.exp(z - np.max(z))
    return e_z / np.sum(e_z)

def rnn_forward(x, y, h_prev, parameters):
    """Run the RNN forward over one sequence and accumulate its loss.

    Args:
        x: Sequence of input character indices. ``x[0]`` is never read: the
            input at step 0 is always the zero vector.
        y: Sequence of target character indices, one per step of ``x``.
        h_prev: Hidden state to start from, shape (hidden, 1).
        parameters: dict with "Wxh", "Whh", "b", "Why", "c".

    Returns:
        Tuple ``(loss, cache)``. ``loss`` is the summed negative
        log-probability of the targets. ``cache`` is ``(xs, hs, probs)``:
        dicts keyed by time step holding the one-hot inputs, hidden states
        (``hs[-1]`` is a copy of ``h_prev``) and output distributions.
    """
    # Retrieve parameters
    Wxh, Whh, b = parameters["Wxh"], parameters["Whh"], parameters["b"]
    Why, c = parameters["Why"], parameters["c"]
    # Read the vocabulary size off the weights instead of relying on a
    # module-level ``vocab_size`` being defined before the call.
    vocab_size = Wxh.shape[1]

    # Per-time-step inputs, hidden states and probabilities
    xs, hs, probs = {}, {}, {}

    # The first input is the zero vector
    xs[0] = np.zeros((vocab_size, 1))

    loss = 0
    hs[-1] = np.copy(h_prev)

    # Forward pass: loop over all characters of the sequence
    for t in range(len(x)):
        # One-hot encode the input (step 0 keeps the zero vector)
        if t > 0:
            xs[t] = np.zeros((vocab_size, 1))
            xs[t][x[t]] = 1
        # Hidden state. NOTE(review): the ``np.dot(Wxh`` / ``np.dot(Whh`` /
        # ``np.dot(Why`` prefixes were missing from the original text and are
        # reconstructed from the parameter shapes.
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + b)
        # Logits (named ``logits`` rather than ``os`` to avoid shadowing the
        # standard-library module name)
        logits = np.dot(Why, hs[t]) + c
        # Probabilities
        probs[t] = softmax(logits)
        # Cross-entropy contribution of the target character
        loss -= np.log(probs[t][y[t], 0])

    cache = (xs, hs, probs)

    return loss, cache

def smooth_loss(loss, current_loss):
    """Blend a new loss value into a running, exponentially weighted average.

    Args:
        loss: Running smoothed loss so far.
        current_loss: Loss of the latest example.

    Returns:
        ``0.999 * loss + 0.001 * current_loss``.
    """
    return 0.999 * loss + 0.001 * current_loss

def clip_gradients(gradients, max_value):
    """Clip every gradient array to ``[-max_value, max_value]`` in place.

    Args:
        gradients: dict of NumPy arrays; the arrays are modified in place.
        max_value: Non-negative clipping bound.

    Returns:
        The same ``gradients`` dict that was passed in.
    """
    # Iterate the arrays directly; ``out=`` writes the result back into each
    # one, so no re-assignment into the dict is needed.
    for grad in gradients.values():
        np.clip(grad, -max_value, max_value, out=grad)

    return gradients

def rnn_backward(y, parameters, cache):
    """Back-propagate through time over one sequence.

    Args:
        y: Sequence of target character indices, one per cached time step.
        parameters: dict with "Whh", "Wxh", "b", "Why", "c".
        cache: ``(xs, hs, probs)`` as returned by ``rnn_forward``.

    Returns:
        Tuple ``(grads, h_prev)``. ``grads`` maps "dWhh", "dWxh", "db",
        "dWhy", "dc" to gradients clipped to [-5, 5]; ``h_prev`` is the
        hidden state of the last time step.
    """
    # Retrieve xs, hs, and probs
    xs, hs, probs = cache
    Why, Whh = parameters["Why"], parameters["Whh"]

    # Gradient flowing into the hidden state from the following time step
    dh_next = np.zeros_like(hs[0])

    # Initialize all gradients to zero
    parameters_names = ["Whh", "Wxh", "b", "Why", "c"]
    grads = {
        "d" + name: np.zeros_like(parameters[name])
        for name in parameters_names
    }

    # Iterate over all time steps in reverse order.
    # NOTE(review): every ``np.dot(<lhs>`` prefix was missing from the original
    # text; the left operands are reconstructed from the gradient shapes.
    for t in reversed(range(len(xs))):
        # Gradient of the loss w.r.t. the logits: probs - one_hot(target)
        dy = np.copy(probs[t])
        dy[y[t]] -= 1
        grads["dWhy"] += np.dot(dy, hs[t].T)
        grads["dc"] += dy
        dh = np.dot(Why.T, dy) + dh_next
        # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2
        dhraw = (1 - hs[t] ** 2) * dh
        grads["dWhh"] += np.dot(dhraw, hs[t - 1].T)
        grads["dWxh"] += np.dot(dhraw, xs[t].T)
        grads["db"] += dhraw
        dh_next = np.dot(Whh.T, dhraw)

    # Clip the gradients using [-5, 5] as the interval
    grads = clip_gradients(grads, 5)
    # Get the last hidden state
    h_prev = hs[len(xs) - 1]

    return grads, h_prev

def update_parameters_with_adam(
        parameters, grads, v, s, t, learning_rate, beta1=0.9, beta2=0.999,
        epsilon=1e-8):
    """Apply one Adam step to every RNN parameter.

    The parameter arrays are updated in place; the entries of ``v`` and ``s``
    are replaced with the new moment estimates.

    Args:
        parameters: dict with "Whh", "Wxh", "b", "Why", "c".
        grads: dict of gradients keyed "dWhh", "dWxh", "db", "dWhy", "dc".
        v: First-moment accumulators, keyed like ``grads``.
        s: Second-moment accumulators, keyed like ``grads``.
        t: Step counter used for bias correction; must be >= 1, because
            ``1 - beta ** 0`` is zero.
        learning_rate: Step size.
        beta1: Decay rate of the first moment.
        beta2: Decay rate of the second moment.
        epsilon: Stabilizer added under the square root. NOTE(review): the
            original signature was cut off before this parameter; the default
            matches ``update_parameters_with_rmsprop``.

    Returns:
        Tuple ``(parameters, v, s)`` -- the same dict objects passed in.
    """
    parameters_names = ["Whh", "Wxh", "b", "Why", "c"]

    for param_name in parameters_names:
        grad_key = "d" + param_name
        grad = grads[grad_key]

        # Update the moving average of first gradient and squared gradient.
        # (The original split these expressions across lines without any
        # continuation, which is a syntax error.)
        v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * grad
        s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * np.square(grad)

        # Compute the bias-corrected estimates of the moving averages
        v_corrected = v[grad_key] / (1 - beta1 ** t)
        s_corrected = s[grad_key] / (1 - beta2 ** t)

        # Update parameters. epsilon stays inside the square root, exactly as
        # in the original formula.
        parameters[param_name] -= (
            (learning_rate * v_corrected) / np.sqrt(s_corrected + epsilon))

    return parameters, v, s

def update_parameters(parameters, grads, learning_rate):
    """Apply one plain gradient-descent step to every parameter, in place.

    Args:
        parameters: dict of parameter arrays.
        grads: dict holding, for each parameter name ``p``, its gradient
            under the key ``"d" + p``.
        learning_rate: Step size.

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    for param in parameters:
        parameters[param] -= learning_rate * grads["d" + param]

    return parameters

def update_parameters_with_rmsprop(
        parameters, grads, s, beta=0.9, learning_rate=0.001, epsilon=1e-8):
    """Apply one RMSProp step to every RNN parameter.

    The parameter arrays are updated in place; the entries of ``s`` are
    replaced with the new squared-gradient averages.

    Args:
        parameters: dict with "Whh", "Wxh", "b", "Why", "c".
        grads: dict of gradients keyed "dWhh", "dWxh", "db", "dWhy", "dc".
        s: Squared-gradient accumulators, keyed like ``grads``.
        beta: Decay rate of the squared-gradient average.
        learning_rate: Step size.
        epsilon: Stabilizer added under the square root.

    Returns:
        Tuple ``(parameters, s)`` -- the same dict objects passed in.
    """
    parameters_names = ["Whh", "Wxh", "b", "Why", "c"]

    for param_name in parameters_names:
        grad_key = "d" + param_name
        grad = grads[grad_key]

        # Update exponential weighted average of squared gradients. (The
        # original split this expression across lines without any
        # continuation, which is a syntax error.)
        s[grad_key] = beta * s[grad_key] + (1 - beta) * np.square(grad)

        # Update parameters
        parameters[param_name] -= (
            (learning_rate * grad) / np.sqrt(s[grad_key] + epsilon))

    return parameters, s

def sample(parameters, idx_to_chars, chars_to_idx, n):
    """Generate a string by sampling characters from the RNN one at a time.

    Starting from a zero hidden state and a zero input vector, a character is
    repeatedly drawn from the output distribution and fed back as the next
    one-hot input. Sampling stops once a newline is drawn or after
    ``n + 1`` draws (the loop condition is ``counter <= n``).

    Args:
        parameters: dict with "Whh", "Wxh", "b", "Why", "c".
        idx_to_chars: Mapping from character index to character.
        chars_to_idx: Mapping from character to index; must contain "\\n".
        n: Bound on the number of sampled characters (see above).

    Returns:
        The sampled characters joined into a string, newline excluded.
    """
    # Retrieve parameters, shapes, and vocab size
    Whh, Wxh, b = parameters["Whh"], parameters["Wxh"], parameters["b"]
    Why, c = parameters["Why"], parameters["c"]
    n_h, n_x = Wxh.shape
    vocab_size = c.shape[0]
    # The terminator is the newline character; the backslash of "\n" had been
    # lost in the original, which made the letter "n" end every sample.
    newline_idx = chars_to_idx["\n"]

    # Initialize the hidden state and the first input to zero vectors
    h_prev = np.zeros((n_h, 1))
    x = np.zeros((n_x, 1))

    # Initialize empty sequence
    indices = []
    idx = -1
    counter = 0
    while counter <= n and idx != newline_idx:
        # Fwd propagation. NOTE(review): the ``np.dot(<lhs>`` prefixes were
        # missing from the original text and are reconstructed from the
        # parameter shapes.
        h = np.tanh(np.dot(Whh, h_prev) + np.dot(Wxh, x) + b)
        o = np.dot(Why, h) + c
        probs = softmax(o)

        # Sample the index of the character using generated probs distribution
        idx = np.random.choice(vocab_size, p=probs.ravel())

        # Add the char to the sequence. This statement was missing, so the
        # function always returned an empty string.
        indices.append(idx)

        # Feed the sampled character back in as the next one-hot input
        h_prev = np.copy(h)
        x = np.zeros((n_x, 1))
        x[idx] = 1

        counter += 1

    # Drop the terminating newline by its real index rather than assuming it
    # is index 0 of the vocabulary.
    sequence = "".join(idx_to_chars[i] for i in indices if i != newline_idx)

    return sequence

def model(
        file_path, chars_to_idx, idx_to_chars, hidden_layer_size, vocab_size,
        num_epochs=10, learning_rate=0.01):
    """Train the character-level RNN with RMSProp, one example per step.

    Each line of ``file_path`` is lower-cased, stripped and used as one
    training sequence. At the start of every epoch a sample and the current
    smoothed loss are printed and the examples are shuffled.

    Args:
        file_path: Text file with one training example per line.
        chars_to_idx: Mapping from character to index; must contain "\\n" and
            every character that occurs in the examples.
        idx_to_chars: Inverse mapping, used for sampling.
        hidden_layer_size: Number of hidden units.
        vocab_size: Number of distinct characters.
        num_epochs: Number of passes over the examples.
        learning_rate: RMSProp step size.

    Returns:
        Tuple ``(parameters, overall_loss)``: the trained parameter dict and
        the smoothed loss recorded after every example.
    """
    # Get the data
    with open(file_path) as f:
        examples = [line.lower().strip() for line in f]

    # Initialize parameters
    parameters = initialize_parameters(vocab_size, hidden_layer_size)

    # Initialize RMSProp accumulators (the old comment said "Adam")
    s = initialize_rmsprop(parameters)

    # Initialize loss: the loss of a uniform guess over the vocabulary, times
    # 7 (presumably an assumed typical sequence length -- TODO confirm).
    smoothed_loss = -np.log(1 / vocab_size) * 7

    # Initialize hidden state h0 and overall loss
    h_prev = np.zeros((hidden_layer_size, 1))
    overall_loss = []

    # End-of-sequence target; the backslash of "\n" had been lost.
    newline_idx = chars_to_idx["\n"]

    # Iterate over number of epochs
    for epoch in range(num_epochs):
        # Bold + bright-blue ANSI prefix (the backslashes of \033 were lost).
        print(f"\033[1m\033[94mEpoch {epoch}")

        # Sample one name. NOTE(review): the original f-string was cut off
        # after ``chars_to_idx,``; the length bound 10 and ``.capitalize()``
        # are reconstructed -- confirm against the source notebook.
        sampled_name = sample(parameters, idx_to_chars, chars_to_idx, 10)
        print(f"Sampled name: {sampled_name.capitalize()}")
        print(f"Smoothed loss: {smoothed_loss:.4f}\n")

        # Shuffle examples (the statement under this comment was missing)
        np.random.shuffle(examples)

        # Iterate over all examples (SGD)
        for example in examples:
            # x is shifted right by one step; its leading None is a
            # placeholder the forward pass never reads.
            x = [None] + [chars_to_idx[char] for char in example]
            y = x[1:] + [newline_idx]
            # Fwd pass
            loss, cache = rnn_forward(x, y, h_prev, parameters)
            # Compute smooth loss
            smoothed_loss = smooth_loss(smoothed_loss, loss)
            # Bwd pass
            grads, h_prev = rnn_backward(y, parameters, cache)
            # Update parameters. learning_rate is now forwarded; it used to be
            # ignored, so RMSProp always ran with its own default.
            parameters, s = update_parameters_with_rmsprop(
                parameters, grads, s, learning_rate=learning_rate)
            # Record the history that is returned to the caller; without this
            # the returned list was always empty.
            overall_loss.append(smoothed_loss)

    return parameters, overall_loss
# Load names. The context manager closes the file handle, which the original
# bare open(...).read() never did.
with open("rnn.txt", "r") as names_file:
    data = names_file.read()

# Convert characters to lower case
data = data.lower()

# Construct vocabulary using unique characters, sort it in ascending order,
# then construct two dictionaries that map character to index and index to
# character.
chars = list(sorted(set(data)))
chars_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_chars = {i: ch for ch, i in chars_to_idx.items()}

# Get the size of the data and vocab size
data_size = len(data)
vocab_size = len(chars_to_idx)
print(f"There are {data_size} characters and {vocab_size} unique characters.")

# Fitting the model
# NOTE(review): the vocabulary above is built from "rnn.txt" in the working
# directory while training reads "/content/rnn.txt" -- confirm both paths
# refer to the same file.
parameters, loss = model("/content/rnn.txt", chars_to_idx, idx_to_chars, 100, vocab_size, 10, 0.01)

# Plotting the loss
plt.plot(range(len(loss)), loss)
plt.ylabel("Smoothed loss");

# NOTE(review): orphaned fragment. It ends in a ``return`` but no enclosing
# ``def`` is present, the loop body has lost its indentation, and nothing
# visible here ever appends to ``loss``. It is left untouched because the
# missing parts cannot be recovered from this file.
loss = list()
for i in range(5):
# fit model for one epoch on this sequence
# NOTE(review): the call target and first argument before ", y" are missing
# (the keyword arguments look like a Keras-style fit call -- TODO confirm).
hist =, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
return loss

# Lab 12: Build an LSTM network for named-entity recognition.

