# reading book text
book = open('pg39.txt')
book_content = book.read()
This blog post explains the procedure to train word embeddings from scratch. Word embeddings are dense vector representations of text data (check this blog post for the basics).
Word embeddings can be learned from a text corpus in both supervised and unsupervised ways. In the supervised way, we prepare our text data in such a way that each data record has a ground truth. The unsupervised way utilizes the text data without any ground truth.
In this post, we will explore a supervised way of learning word embeddings, i.e., Continuous Bag of Words (CBOW).
Continuous Bag of Words (CBOW)
In CBOW, we aim to predict a word given its surrounding words within a window of pre-defined size.
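To make this concrete, here is a tiny sketch of how a window of size 2 defines the context for a word. This snippet is just for illustration and is not used later in the post.
# with window_size = 2, the word at position i is predicted from up to
# two words on each side of it
tokens = ['the', 'project', 'gutenberg', 'ebook', 'of']
i, window_size = 2, 2
context = tokens[max(0, i - window_size):i] + tokens[i + 1:i + window_size + 1]
print(context, '->', tokens[i])   # ['the', 'project', 'ebook', 'of'] -> gutenberg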
For our task, we will use free text available through Project Gutenberg, where thousands of books are available. We will use 'Hitchhiker's Guide to the Internet', which is available at this link.
The book content is in raw form, which we need to preprocess and transform into a format that can be utilized for learning word embeddings. For that, we will first read the book content and break it into a list of sentences. To do that, we will use the punkt sentence tokenizer from the NLTK library.
The dataset chosen in this example is very small from an NLP perspective, where text corpora usually contain millions of words. This is just for the sake of learning and keeping things simple.
import nltk.data
import string
import re

# breaking the text into sentences
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = sent_tokenizer.tokenize(book_content.strip())

print('Number of sentences:', len(sentences))
Number of sentences: 519
# converting sentences into lower case
l_sentences = [sent.lower() for sent in sentences]

processed = []
# we will add spaces around punctuation in the text (for splitting purposes)
for sent in l_sentences:
    text = re.sub(r"([?!,])", r" \1 ", sent)
    text = re.sub(r"[^a-zA-Z,?!]", r" ", text)
    processed.append(text)
final = []

for sentence in processed:
    # tokenize the sentence
    tokens = nltk.tokenize.word_tokenize(sentence)

    # removing punctuation
    processed_tokens = [token for token in tokens if token not in set(string.punctuation)]

    final.append(processed_tokens)

print('Samples:', final[10])
Samples: ['a', 'few', 'bsd', 'systems', 'on', 'an', 'ethernet', 'not', 'connected', 'to', 'anywhere', 'else']
Now that our text is pre-processed, we will move on to preparing a dataset file from it.
def get_cbow_instances(text, window_size=2):
    """
    Argument:
    ---------
    text: list of str
        tokenized form of a sentence
    Return:
    -------
    list
        list of (word, context) tuples
    """
    max_elements_in_window = 2 * window_size + 1

    current_index = 0
    instances = []

    while True:
        if current_index == len(text):
            break

        current_window_min = current_index
        current_window_max = current_index

        # extend the window to the left, without going past the start of the sentence
        for min_option in range(window_size + 1):
            if (current_index - min_option) >= 0:
                current_window_min = current_index - min_option

        # extend the window to the right, without going past the end of the sentence
        for max_option in range(window_size + 1):
            if (current_index + max_option) < len(text):
                current_window_max = current_index + max_option

        current_text = text[current_window_min:current_window_max + 1]

        current_relative_index = current_index - current_window_min

        # the context is the window with the current word removed
        current_context = []
        for ind, t in enumerate(current_text):
            if ind == current_relative_index:
                continue
            current_context.append(t)

        instances.append((text[current_index], " ".join(current_context)))
        current_index += 1

    return instances
d = ['the', 'project', 'gutenberg',
     'ebook', 'of', 'hitchhiker',
     "'s", 'guide', 'to', 'the', 'internet',
     'this', 'ebook', 'is', 'for', 'the', 'use',
     'of', 'anyone', 'anywhere', 'in', 'the', 'united',
     'states', 'and', 'most', 'other', 'parts', 'of', 'the',
     'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no',
     'restrictions', 'whatsoever']

cbow_instances = get_cbow_instances(d, window_size=2)
print(cbow_instances[:5])
[('the', 'project gutenberg'), ('project', 'the gutenberg ebook'), ('gutenberg', 'the project ebook of'), ('ebook', 'project gutenberg of hitchhiker'), ('of', "gutenberg ebook hitchhiker 's")]
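As a quick sanity check (this check is an addition, not part of the original walkthrough), the function emits exactly one (word, context) pair per token, so the number of instances should match the number of tokens in the sentence.
print(len(d), len(cbow_instances))   # both should be 40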
Now we will process all the sentences and prepare a DataFrame to build our model.
import pandas as pd

x = []
y = []
for sent in final:
    instances = get_cbow_instances(sent)

    for pair in instances:
        x.append(pair[1])
        y.append(pair[0])

df = pd.DataFrame({'x': x, 'y': y})

df.to_csv('book.csv', index=False)
Preparing dataset for PyTorch
We will now vectorize our input data to use for learning embeddings using PyTorch. In a previous post, we delved deeper into the vectorization process using a vocabulary and a vectorizer. You can check that post.
We will briefly explain that process here.
- First, we will build a vocabulary, which mainly offers a mapping between all the unique words in the corpus and their indices.
- Second, we transform our text input into a vector of token indices using the vocabulary.
- Third, we will use PyTorch Dataset and DataLoader classes to finalize our dataset for training.
For the first step, we will reuse our Vocabulary class from a previous post. The source code is given below. For more details on the code please refer to the post.
class CBOWVocabulary(object):
    def __init__(self, token_to_idx=None, add_unk=True, unk_token='<UNK>'):
        """
        params:
            token_to_idx (dict): mapping from token to index
            add_unk (bool): flag to add a special token to the vocabulary for unknown tokens
            unk_token (str): token used as the special unknown token
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """
        Add token to the vocabulary
        params:
            token (str): token to add to the vocabulary
        returns:
            idx (int): index of token
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        else:
            idx = len(self)
            self._token_to_idx[token] = idx
            self._idx_to_token[idx] = token
            return idx

    def lookup_idx(self, idx):
        """
        Lookup vocabulary to fetch token at idx
        params:
            idx (int): index of token to be fetched
        returns:
            token (str): token stored at idx
        """
        if idx not in self._idx_to_token:
            raise KeyError("Vocabulary does not have token with specified index: %d" % idx)
        return self._idx_to_token[idx]

    def lookup_token(self, token):
        """
        Lookup vocabulary to fetch index of a token
        params:
            token (str): token to lookup
        returns:
            idx (int): index of token
        """
        if token not in self._token_to_idx:
            return self.unk_index
        else:
            return self._token_to_idx[token]

    def __len__(self):
        return len(self._idx_to_token)

    def __str__(self):
        return "Vocabulary (size = %d)" % len(self)
We will now populate our vocabulary with the text data from the book.
The following code adds all tokens to the vocabulary. One thing to note here is that we are adding every token, but in practice you may find it useful to discard tokens with a very low frequency count.
# Build CBOWVocabulary
vocab = CBOWVocabulary()

# Populate vocabulary
for sent in final:
    for tok in sent:
        vocab.add_token(tok)

# printing size
print(len(vocab))
1898
Now, we will prepare our Vectorizer. The vectorizer will transform our input text data into a vector containing indices of tokens in the Vocabulary.
For example, in our vocabulary, the indices of the words project and gutenberg are 2 and 3, respectively. That means when we apply the vectorizer to the context project gutenberg, it will return a vector containing 2 and 3.
We know that the window size chosen was 2. That means the maximum number of items in the vector representing a context is 4 (2 on the left side and 2 on the right side). A context can also be made of fewer words, e.g., the context project gutenberg for the word the. To keep the size of the vector consistent, we add padding to our vector for cases when the context has fewer than four words.
The following code achieves that functionality. It returns a vector of size 4 for the specified context data.
import numpy as np

def vectorizer(context, window_size=2):
    """
    This function transforms the context text into a vector of integers representing indices of words in the text.
    Argument:
    ---------
    context: str
        a string containing context words
    window_size: int
        window size to determine the size of the returned vector and add padding
    Returns:
    -------
    np.array
        an array of indices of words in context
    """
    # the context can contain at most double the window size, e.g., 2 words on the left side, 2 on the right side
    max_context_size = 2 * window_size

    vector = np.zeros(max_context_size)

    for ind, word in enumerate(context.split(" ")):
        vector[ind] = vocab.lookup_token(word)

    return vector

vectorizer('project gutenberg')
array([2., 3., 0., 0.])
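A quick check (an addition to the original walkthrough): whatever the length of the context string, the returned vector always has 2 * window_size entries, with trailing zeros acting as padding.
print(vectorizer('project gutenberg').shape)      # (4,)
print(vectorizer('the project ebook of').shape)   # (4,)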
Let’s now move to the final step of preparing the dataset for training our classifier. We will now use PyTorch’s Dataset and DataLoader classes to simplify the process of generating batches for training in the format needed.
The Dataset and DataLoader classes take care of transforming arrays into tensors and preparing batches.
We will create a new class CBOWDataset, which basically implements two functions, __getitem__ and __len__. The first function returns the vectorized context data and the index of the word to predict. The second function returns the total number of instances in the dataset.
from torch.utils.data import Dataset, DataLoader

class CBOWDataset(Dataset):
    def __init__(self, df):
        """
        Argument:
        --------
        df: Pandas DataFrame
            dataframe containing context and word to predict
        """
        self.df = df

    def __getitem__(self, index):
        """
        Returns vectorized context data and the index of the word to predict
        """
        record = self.df.iloc[index, :].to_dict()

        return {'input_x': vectorizer(record['x']), 'y': vocab.lookup_token(record['y'])}

    def __len__(self):
        """
        Returns the size of the dataset
        """
        return self.df.shape[0]
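As a small illustration (an addition, not used later), each item of the dataset is a dictionary pairing a vectorized context with the vocabulary index of the word to predict.
sample_dataset = CBOWDataset(df)

print(len(sample_dataset))   # one instance per token in the corpus
print(sample_dataset[0])     # {'input_x': array of 4 indices, 'y': index of the word to predict}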
Building model using PyTorch
We will now build our model architecture using PyTorch. PyTorch offers an Embedding layer which makes it easy to handle word embeddings. The layer is used to store and retrieve word embeddings using indices. That’s the reason why in our vectorized form we only have indices of tokens.
We will use an Embedding layer at the start of our model, which will fetch the corresponding word embeddings and pass them to the next layer.
To create an Embedding layer, we need to specify num_embeddings, which is the size of the vocabulary (i.e., the total number of tokens in the vocabulary), and embedding_dim, which is the number of dimensions used to represent each word embedding (e.g., 100, 200).
During our vectorization process, we added padding (i.e., 0) to the vector when the context size was less than four. To tell the model that this 0 should not have any effect during training (in other words, the model must not update the embedding at this index), we can specify padding_idx as 0.
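The following standalone snippet (an illustration with made-up sizes, separate from our model below) shows what the Embedding layer does: it is a lookup table indexed by token ids, and the row at padding_idx starts as zeros and receives no gradient updates.
import torch
from torch import nn

emb = nn.Embedding(num_embeddings=6, embedding_dim=3, padding_idx=0)

# a batch with one context vector of four token indices (0 is the padding index)
ids = torch.tensor([[2, 3, 0, 0]])
print(emb(ids).shape)   # torch.Size([1, 4, 3])
print(emb.weight[0])    # tensor([0., 0., 0.]) -- the padding row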
import torch
from torch import nn
import torch.nn.functional as F

num_tokens = len(vocab)
embedding_dim = 50

class CBOWClassifier(nn.Module):
    """
    Classifier for learning word embeddings
    """
    def __init__(self, vocab_size, embedding_dim):
        """
        Arguments:
            vocab_size: int
                Size of vocabulary
            embedding_dim: int
                Embedding dimensions
        """
        super(CBOWClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim,
                                      padding_idx=0)
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, input_x, apply_softmax=False):
        """
        Performs the forward pass
        Arguments:
            input_x: torch.tensor
                input tensor of shape (batch_size, input_dim)
            apply_softmax: bool
                flag to perform softmax
        """
        # sum the embeddings of the context words into a single vector
        intermediate = self.embedding(input_x).sum(dim=1)

        output = self.fc1(intermediate)

        if apply_softmax:
            output = F.softmax(output, dim=1)
        return output
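As a quick sanity check (an addition to the original code), an untrained classifier should map a batch of context vectors to one score per word in the vocabulary.
dummy_batch = torch.zeros(2, 4, dtype=torch.int64)   # 2 contexts of 4 token indices each
print(CBOWClassifier(num_tokens, embedding_dim)(dummy_batch).shape)   # torch.Size([2, 1898])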
Training CBOW Classifier
The following training procedure utilizes the entire dataset. This is just for the sake of learning and keeping the post simple to understand.
In practice, the dataset is often divided into three parts: train, validation, and test. It is recommended to follow the same practice when working on a real-world ML project, as sketched below.
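For reference, a minimal sketch of such a split on our DataFrame could look like the following. This split is not used in the rest of the post, and the 80/10/10 ratios and variable names are just an example.
shuffled = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

n_train = int(0.8 * len(shuffled))
n_val = int(0.1 * len(shuffled))

train_df = shuffled.iloc[:n_train]
val_df = shuffled.iloc[n_train:n_train + n_val]
test_df = shuffled.iloc[n_train + n_val:]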
import torch.optim as optim

num_epochs = 30
classifier = CBOWClassifier(num_tokens, 100)

dataset = CBOWDataset(df)
loader = DataLoader(dataset, batch_size=50)
adam = optim.Adam(classifier.parameters(), lr=.001)
loss_fnc = nn.CrossEntropyLoss()

classifier.train()
for epoch in range(num_epochs):
    running_loss = 0.0

    for batch_index, batch in enumerate(loader):
        # setting gradients from the previous step to zero
        adam.zero_grad()

        # forward pass
        output = classifier(batch['input_x'].int())

        # computing loss
        loss = loss_fnc(output, batch['y'])

        # backward pass
        loss.backward()

        # update parameters
        adam.step()

        # running average of the loss over the batches seen so far
        running_loss += (loss.item() - running_loss) / (batch_index + 1)

    print('Epoch:', epoch, ' Loss:', running_loss)
Epoch: 0 Loss: 7.323485131575679
Epoch: 1 Loss: 5.867017787193583
Epoch: 2 Loss: 5.039742054226242
Epoch: 3 Loss: 4.407589763124413
Epoch: 4 Loss: 3.8932629177503517
Epoch: 5 Loss: 3.471296124369183
Epoch: 6 Loss: 3.1283544541519386
Epoch: 7 Loss: 2.8483516849647055
Epoch: 8 Loss: 2.614800505548995
Epoch: 9 Loss: 2.415219902992248
Epoch: 10 Loss: 2.241242004332143
Epoch: 11 Loss: 2.087147696970779
Epoch: 12 Loss: 1.9489659980357257
Epoch: 13 Loss: 1.8238577800933438
Epoch: 14 Loss: 1.7096240920162649
Epoch: 15 Loss: 1.6044389607611103
Epoch: 16 Loss: 1.5074336672915487
Epoch: 17 Loss: 1.417175150363245
Epoch: 18 Loss: 1.3331712223589423
Epoch: 19 Loss: 1.2546023246701625
Epoch: 20 Loss: 1.1812430473967133
Epoch: 21 Loss: 1.1125499971002064
Epoch: 22 Loss: 1.0486516395461891
Epoch: 23 Loss: 0.9879383787721678
Epoch: 24 Loss: 0.9312528835731407
Epoch: 25 Loss: 0.8782325799481193
Epoch: 26 Loss: 0.8286848590909462
Epoch: 27 Loss: 0.7823101626709101
Epoch: 28 Loss: 0.7389837774474207
Epoch: 29 Loss: 0.6985426304435415
Using trained embeddings
Now, we will use our trained embeddings to find words which are close to a specified word. To do this, we will follow these steps:
- Get the weights of the embedding layer of the classifier (these are the word embeddings)
- Obtain the index of the word for which we want to find close words
- Obtain the word embedding of that word using its index
- Iterate over all the words in the vocabulary, obtaining their indices, fetching their word embeddings, and computing the distance to the query word's embedding
- Take the n words with the smallest distances
def get_close_words(word_to_search, word_to_index, embeddings, n=10):
    """
    Get the n closest words to the specified word
    Arguments:
    ----------
    word_to_search: str
        word which we want to search
    word_to_index: dictionary
        mapping from word to index in vocabulary
    embeddings: torch.tensor
        matrix of trained word embeddings
    n: int
        number of words to return
    Returns:
    -------
    list: a list of words which are closest to the specified word.
    """
    word_embedding = embeddings[word_to_index[word_to_search]]

    distances = []

    for word, index in word_to_index.items():
        if word == '<UNK>' or word == word_to_search:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index]).item()))

    sort_distances = sorted(distances, key=lambda x: x[1])

    return sort_distances[1:n+2]

embeddings = classifier.embedding.weight.data

close_words = get_close_words('network', vocab._token_to_idx, embeddings, n=5)

for c in close_words:
    print('[{:.2f}] -- {}'.format(c[1], c[0]))
[12.84] -- directional
[13.04] -- tables
[13.06] -- sections
[13.15] -- city
[13.15] -- users
[13.20] -- backs