# TD: express sentiment classifier (for English)

(copied from Robyn Speer, with her consent)

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading static embeddings for English (here, GloVe)

From: https://nlp.stanford.edu/data/glove.42B.300d.zip

Unzip the file into the data/ directory.

It takes some time to load...

In [2]:
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.

    Each data line holds a label followed by its space-separated vector
    components. A line made of exactly two fields is treated as the optional
    shape header and skipped; blank lines are skipped as well.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded embeddings text file.

    Returns
    -------
    pandas.DataFrame
        One float32 row per label, indexed by the labels in file order.

    Raises
    ------
    ValueError
        If the file does not contain any embedding row.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                # A blank line (e.g. an empty last line) carries no vector;
                # keeping it would add a zero-length row and break np.vstack.
                continue
            items = line.split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        raise ValueError('no embeddings found in %r' % (filename,))
    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

# Parse the whole GloVe text file into a single DataFrame, one row per word.
embeddings = load_embeddings('data/glove.42B.300d.txt')
# Display (number of words, embedding dimension) as a quick sanity check.
embeddings.shape

(1917494, 300)

## Let's load a sentiment lexicon

Here (first URL) : https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon

Save it next to your notebook, under data/ (the code below expects the files in data/opinion-lexicon-English/).

In [3]:
def load_lexicon(filename):
    """
    Read one word list of Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).

    The lexicon ships as two Latin-1 encoded files of English words, one
    with the positive words and one with the negative words. Blank lines
    and comment lines (those starting with ';') are not entries and are
    left out.

    Returns the remaining lines, in file order, with trailing whitespace
    removed.
    """
    with open(filename, encoding='latin-1') as infile:
        stripped = (raw.rstrip() for raw in infile)
        return [entry for entry in stripped
                if entry and not entry.startswith(';')]

# One word list per polarity, each read from its own file of the lexicon.
pos_words = load_lexicon('data/opinion-lexicon-English/positive-words.txt')
neg_words = load_lexicon('data/opinion-lexicon-English/negative-words.txt')

The data points we have here are the embeddings of these positive and negative words. We use the pandas .loc[] indexer to obtain the word embeddings.

Some words are not in the GloVe vocabulary, in particular typos like "fancinating". Looking them up directly with .loc[] would fail (or, with older pandas versions, produce rows full of NaN indicating missing embeddings). We therefore keep only the lexicon words that do have an embedding, using embeddings.index.intersection(pos_words).

In [4]:
# Restrict the lookup to lexicon words that are present in the embeddings
# index, so that .loc is never asked for a label it does not have.
pos_vectors = embeddings.loc[embeddings.index.intersection(pos_words)]
neg_vectors = embeddings.loc[embeddings.index.intersection(neg_words)]

Now let's create input and output tables. As input, we have the embeddings and the output is 1 for words qualified as positive and -1 for the negative ones. We make sure to keep the corresponding words in memory in order to be able to interpret the results.

In [5]:
# Positive rows come first, then negative rows; targets and labels are
# built in that same order so the three stay aligned row by row.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
labels = [*pos_vectors.index, *neg_vectors.index]

We use the scikit-learn train_test_split function to split the input vectors, output values and labels simultaneously into training and test (10 %) data.

In [6]:
# Hold out 10 % of the words for testing. The three sequences are split
# with the same shuffle, so vectors, targets and labels stay aligned.
(
    train_vectors, test_vectors,
    train_targets, test_targets,
    train_labels, test_labels,
) = train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

Now, let's train the classifier (at most 100 passes over the training data).

In [7]:
# Linear classifier trained by stochastic gradient descent with a logistic
# loss; random_state=0 makes the run repeatable.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' (the
# 'log' alias was deprecated in 1.1 and removed in 1.3) -- confirm against the
# installed version before re-running.
model = SGDClassifier(loss='log', random_state=0, max_iter=100)
model.fit(train_vectors, train_targets)

SGDClassifier(loss='log', max_iter=100, random_state=0)

## Let's test the accuracy of the model


In [8]:
# Fraction of held-out words whose predicted polarity matches the lexicon.
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth
# goes first, predictions second.
accuracy_score(test_targets, model.predict(test_vectors))

0.9366515837104072

Let's now define a function that lets us inspect the polarity the classifier predicts for a list of words, and then apply it to the words of the test set:

In [9]:
def vecs_to_sentiment(vecs):
    """
    Collapse the classifier's two class log-probabilities into one score
    per input row.

    The score is the log probability of positive sentiment minus the log
    probability of negative sentiment, so positive values lean positive and
    negative values lean negative.
    """
    # One row per input vector, one column per class.
    log_probs = model.predict_log_proba(vecs)
    negative = log_probs[:, 0]
    positive = log_probs[:, 1]
    return positive - negative


def words_to_sentiment(words):
    """
    Score each word with the classifier's positive-vs-negative log odds.

    Words that have no embedding are left out of the result instead of
    raising an error.

    Parameters
    ----------
    words : list of str
        Words to look up in the global `embeddings` table.

    Returns
    -------
    pandas.DataFrame
        A single 'sentiment' column indexed by the words that were found
        (empty when none of them has an embedding).
    """
    # reindex() produces an all-NaN row for every word missing from the
    # index, which dropna() then removes. A plain .loc[words] raises KeyError
    # on current pandas as soon as one word is out of vocabulary.
    vecs = embeddings.reindex(words).dropna()
    if vecs.empty:
        # Nothing to score; scikit-learn estimators refuse a zero-row input.
        return pd.DataFrame({'sentiment': []}, index=vecs.index, dtype=float)
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

# Preview the scores of the first 20 words of the test set
words_to_sentiment(test_labels).head(20)


Unnamed: 0,sentiment
omit,-9.979709
outrageously,-3.34653
invincibility,-3.628069
blunder,-6.364943
war-like,-4.96119
beautifullly,-1.054522
supple,9.610283
long-time,5.568841
frenetically,-2.996245
badly,-13.244803


We can now compare sentence polarities with the following (very basic) piece of code:

In [10]:
import re
# A token starts at a word-like character (\w) and lazily extends over any
# further characters (.*?) up to the first word break (\b) that follows.
# Simple as it is, this extracts something very much like words from text.
TOKEN_RE = re.compile(r"\w.*?\b")


def text_to_sentiment(text):
    """
    Return the mean sentiment score of the tokens of `text`.

    The text is split with TOKEN_RE, each token is case-folded, and the
    mean is taken over the 'sentiment' column that words_to_sentiment
    returns for those tokens.
    """
    folded = list(map(str.casefold, TOKEN_RE.findall(text)))
    scored = words_to_sentiment(folded)
    return scored['sentiment'].mean()

In [16]:
# TOKEN_RE only starts a token on a word character, so the trailing "!" does
# not become a token of its own.
text_to_sentiment("Nancy rocks!")

1.3996349123031346

In [12]:
# Same two words, without any punctuation.
text_to_sentiment("Nancy rocks")

1.3996349123031346

In [17]:
# Only the first word differs from the "Nancy rocks" example.
text_to_sentiment("NLP rocks")

0.28419300771151323

In [15]:
# A plain statement without opinion words: the score is the mean over every
# token that words_to_sentiment returns, the name included if it has an
# embedding.
text_to_sentiment("My name is Karen")

1.92323228738446