Word Vectors
September 05, 2015
Word vectors are a cool idea for packing word information into an $R^n$ vector. The difference from the bag-of-words method is that $n$ is smaller than the size of the dictionary. Additionally, as a side effect of the optimisation, word vectors have very interesting features.
Word vectors are the result of maximising the probability of finding a word in a specific context:
$$ J( \theta ) = \frac{1}{T} \sum_{i=1}^T \sum_{-c\le j\le c, j\ne 0} \log p( w_{i+j} \mid w_i ) $$The probability function $ p( w_{i+j} \mid w_i ) $ can take many forms, however the original paper uses a form of soft max [1]:
$$ p( w_{i+j} \mid w_i ) = \frac{ e^{ v_{w_o}^T v_{w_i} } }{ \sum_{w=1}^W e^{ v_w^T v_{w_i} } } $$
where $w_o = w_{i+j}$ is the context word and $W$ is the number of words in the dictionary. There are a couple of problems when searching for the parameters of the above function: computing the denominator is expensive, and the resulting matrix can be very large, depending on the number of words in the dictionary. For this reason we will use the gensim package, which wraps the search algorithm [2].
This code is partially based on Kaggle's Bag of Words meets Bag of Popcorn [3]
1 - Efficient Estimation of Word Representations in Vector Space
2 - gensim: Topic modelling for humans - Radim Řehůřek
3 - Word Vectors
%matplotlib inline
import numpy as np
import pandas as pd
import gensim
import logging
import re
import nltk.data
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
# Read the labelled and unlabelled review sets. Both files are tab-separated
# with a header row; quoting=3 is csv.QUOTE_NONE, so quote characters inside
# the review text are kept as ordinary characters.
df = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
df_unlabelled = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

# Random train/test split: one Bernoulli(0.66) draw per labelled row.
# Rows that draw 1 go to train, rows that draw 0 go to test. The generator
# is not seeded here, so the split differs from run to run.
tt_index = np.random.binomial(1, 0.66, size=df.shape[0])
train = df[tt_index == 1]
test = df[tt_index == 0]

# Report the size of each data set.
for template, frame in (
    ('train shape: {0}', train),
    ('test shape: {0}', test),
    ('unlabelled shape: {0}', df_unlabelled),
):
    print(template.format(frame.shape))
# borrowed from Kaggle https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
def review_to_wordlist( review, remove_stopwords=False ):
    """Convert one raw review into a list of lower-case word tokens.

    The HTML markup is stripped with BeautifulSoup, every character that is
    not an ASCII letter is replaced by a space, and the result is lower-cased
    and split on whitespace.

    Args:
        review: Raw review text, possibly containing HTML.
        remove_stopwords: When true, drop every token found in NLTK's
            English stop-word list.

    Returns:
        The list of tokens, in their original order.
    """
    plain_text = BeautifulSoup(review).get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    # A set gives constant-time membership tests while filtering.
    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split one review into sentences, each returned as a list of words.

    Args:
        review: Raw review, either as text or as UTF-8 encoded bytes.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences of ``text`` as an iterable of strings.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        A list with one word list per non-empty sentence.
    """
    text = review.strip()
    # Only byte strings need decoding. Calling .decode() unconditionally
    # raises AttributeError on Python 3 text, and on Python 2 unicode it
    # triggers an implicit ASCII encode that can fail on non-ASCII reviews.
    if isinstance(text, bytes):
        text = text.decode('utf8', 'ignore')

    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(text)
        if len(raw_sentence) > 0  # skip empty sentences
    ]
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Build the sentence corpus: the training reviews first, then the unlabelled
# reviews. The held-out test reviews are not included.
sentences = []
for frame in (train, df_unlabelled):
    for review in frame['review']:
        sentences.extend(review_to_sentences(review, tokenizer))
# Log at INFO level with a timestamp on every line.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Word2Vec hyper-parameters.
num_features = 300    # dimensionality
min_word_count = 40   # minimum word count
num_workers = 6       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

# Train on the sentence corpus built above.
model = gensim.models.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# free memory
# NOTE(review): presumably replace=True keeps only the normalised vectors,
# which would rule out further training - confirm against the gensim docs.
model.init_sims(replace=True)

# save model
model_name = "300features_40minwords_10context"
model.save(model_name)

# stat
print('total run time: {0} [s]'.format(model.total_train_time))
# Tally of in-vocabulary words whose vector contains a NaN: word -> count.
nan_words = {}


def makeFeatureVec( words, model, num_features, index2word_set ):
    """Average the vectors of the in-vocabulary words of one review.

    Args:
        words: Iterable of word tokens.
        model: Mapping-like object where ``model[word]`` is the word vector.
        num_features: Length of each word vector.
        index2word_set: Collection of words that ``model`` knows; tokens
            outside it are ignored.

    Returns:
        The mean of the matched word vectors, or an all-zero float32 vector
        of length ``num_features`` when no token matched.

    Side effects:
        Every matched word whose vector contains a NaN is counted in the
        module-level ``nan_words`` dict. Its vector is still added to the sum.
    """
    total = np.zeros((num_features,), dtype="float32")
    matched = 0.
    for token in words:
        if token not in index2word_set:
            continue
        matched += 1.
        vector = model[token]
        if np.isnan(vector).any():
            nan_words[token] = nan_words.get(token, 0) + 1
        total = total + vector

    # Guard against division by zero when no token was in the vocabulary.
    if matched == 0:
        return total
    return total / matched
def getAvgFeatureVecs(reviews, model, num_features, index2word_set ):
    """Build the matrix of averaged word vectors, one row per review.

    Args:
        reviews: Sequence of reviews, each a list of word tokens.
        model: Word-vector model, forwarded to ``makeFeatureVec``.
        num_features: Length of each word vector (number of columns).
        index2word_set: Vocabulary set, forwarded to ``makeFeatureVec``.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose i-th
        row is ``makeFeatureVec`` applied to the i-th review.
    """
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    # enumerate() yields integer row numbers; the previous float counter is
    # not a valid NumPy index.
    for counter, review in enumerate(reviews):
        # Progress message every 1000 reviews.
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features, index2word_set)
    return reviewFeatureVecs
# Vocabulary of the trained model as a set, for fast membership tests.
index2word_set = set(model.index2word)

# Tokenise the training reviews (stop words removed) and average their vectors.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train['review']
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features, index2word_set)

# Same treatment for the held-out test reviews.
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test['review']
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features, index2word_set)
# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18 and the
# old sklearn.grid_search module was removed in 0.20. Try the current
# location first and fall back for older installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
import xgboost as xgb

# Hyper-parameter grid: every combination of tree depth and number of trees.
x_params = {
    'max_depth': [4, 8, 12],
    'n_estimators': [200, 500, 1000],
    'objective': ['binary:logistic'],
}

xgb_model = xgb.XGBClassifier()
clf = GridSearchCV(xgb_model, x_params, verbose=1, n_jobs=1)
clf.fit(trainDataVecs, train['sentiment'])

# Best cross-validated score and the parameters that achieved it.
print(clf.best_score_)
print(clf.best_params_)

# Score of the selected model on the held-out test split.
score = clf.score(testDataVecs, test['sentiment'])
print('score: {0}'.format(score))
from sklearn.metrics import roc_curve, auc

# Predicted probability of the positive class for every test review.
proba = clf.predict_proba(testDataVecs)[:, 1]

# .values gives the labels as a plain array; proba is already 1-D.
fpr, tpr, thresholds = roc_curve(test['sentiment'].values, proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve. The axes are labelled with the quantities actually
# plotted (false/true positive rate), not precision and recall.
plt.clf()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend(loc="lower right")
plt.show()
OK, AUC is 94% for word vectors vs 90% for simple bag of words. Not bad ... but let's take a look at a few data examples to see what's going on under the hood.
# Row positions within `test` where the prediction disagrees with the label.
diffs = np.where(clf.predict(testDataVecs) != test['sentiment'])[0]

# Print a couple of the misclassified positions (same slice as before, which
# skips the very first one) and show the text of exactly those reviews.
# The positions come from `diffs` rather than being hard-coded, because the
# train/test split is random and changes between runs.
examples = diffs[1:3]
print(examples)
for position in examples:
    # Column 2 is the one the original cell read the review text from.
    print(BeautifulSoup(test.iloc[position, 2]).get_text())
Well, it is hard to say what is driving these errors. My bet is on double negations again, which simple classification methods will definitely struggle with.
For the next encounter with NLP we will bring heavy machinery: Recurrent Neural Networks and see how they would perform with this task.