In [1]:
%%html
<style>
.rendered_html {
    font-size: 24px;
}
td {
    font-size: 20px;
}
</style>

Exploring non-contextualized embeddings

In [2]:
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import numpy as np

import heapq
import spacy
In [3]:
nlp = spacy.load('en_core_web_lg')

w.prob

Smoothed log probability estimate of token’s word type (context-independent entry in the vocabulary).

See https://spacy.io/api/token
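
In practice, more frequent words get a higher (less negative) smoothed log probability. A quick check (not part of the original run; note that with spaCy v3 models the probability table may only be populated when the spacy-lookups-data package is installed):

for text in ("the", "queen", "throne"):
    print(text, nlp.vocab[text].prob)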

In [4]:
# for illustration purposes, we'll explore only a subset of the
# vocabulary: lowercase words within a mid-range band of prior
# (smoothed log) probabilities
vocab = [w for w in nlp.vocab
         if w.is_lower and -4.5 >= w.prob >= -8.5]

vectors = np.stack([w.vector for w in vocab])

print(vectors.shape)
(363, 300)

Dimensionality reduction using t-SNE (2D)

In [5]:
tsne_model = TSNE(n_components=2, random_state=42)
reduced = tsne_model.fit_transform(vectors)
print(reduced.shape)
(363, 2)
In [6]:
x_coords, y_coords = reduced.T
plt.rc("figure", figsize=[25, 15]); plt.rc("font", size=14)
plt.scatter(x_coords, y_coords, color="white", marker=".")

for word, x, y in zip(vocab, x_coords, y_coords):
    plt.annotate(word.text, xy=(x, y), xytext=(0, 0), textcoords='offset points')

Dimensionality reduction using t-SNE (3D)

In [7]:
tsne_model = TSNE(n_components=3, random_state=42)

reduced = tsne_model.fit_transform(vectors)

print(reduced.shape)
(363, 3)
In [8]:
from mpl_toolkits.mplot3d import Axes3D


def plot_3d():
    x_coords, y_coords, z_coords = reduced.T
    plt.rc("figure", figsize=[40, 40]); plt.rc("font", size=18)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(x_coords, y_coords, z_coords)
    ax.set_xlim3d(min(x_coords)/2, max(x_coords)/2)
    ax.set_ylim3d(min(y_coords)/2, max(y_coords)/2)
    ax.set_zlim3d(min(z_coords)/2, max(z_coords)/2)

    for word, x, y, z in zip(vocab, x_coords, y_coords, z_coords):
        ax.text(x, y, z, word.text, None, color='black')
    plt.show()
In [9]:
plot_3d()

Fun with vector arithmetic

Analogies as transformations in vector space

In [10]:
def vector(word):
    return nlp.vocab[word].vector

def cosine(vector1, vector2):
    # cosine_similarity returns the full 2x2 similarity matrix for the pair;
    # take the off-diagonal entry (the diagonal is each vector's similarity
    # with itself, which is always 1)
    [[_, x], [_, _]] = cosine_similarity([vector1, vector2])
    return x

def nclosest(k, vocab, vector1):
    # the k vocabulary entries whose vectors are most similar to vector1
    return [w.text for w in heapq.nlargest(k, vocab,
                                           key=lambda w: cosine(vector1, w.vector))]
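
The analogy queries below all follow the same pattern: take the offset between two related words and apply it to a third, then look for the nearest vocabulary entries. As a sketch (the analogy helper below is not part of the original notebook; it just bundles the vector and nclosest helpers defined above):

def analogy(a, b, c, k=10, prob_floor=-15, titlecase=False):
    # "a is to b as c is to ?": apply the b - a offset to c's vector
    target = vector(b) - vector(a) + vector(c)
    candidates = [w for w in nlp.vocab
                  if (w.is_title if titlecase else w.is_lower)
                  and w.prob >= prob_floor]
    return nclosest(k, candidates, target)

For example, analogy("man", "king", "woman") corresponds to the first query below, and analogy("Italy", "Rome", "France", titlecase=True) to the last.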

man is to king as woman is to ...

In [11]:
cosine(vector("queen"), vector("king"))
Out[11]:
0.7252606
In [12]:
female_monarch = vector("king") - vector("man") + vector("woman")

cosine(vector("queen"), female_monarch)
Out[12]:
0.7880841
In [13]:
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]

nclosest(10, vocab, female_monarch)
Out[13]:
['king',
 'queen',
 'prince',
 'kings',
 'princess',
 'royal',
 'throne',
 'queens',
 'monarch',
 'kingdom']

Can we exaggerate the effect?

In [14]:
female_monarch = vector("king") - vector("man") - vector("prince") + vector("woman")

cosine(vector("queen"), female_monarch)
Out[14]:
0.3986122
In [15]:
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]

nclosest(10, vocab, female_monarch)
Out[15]:
['queen',
 'breast',
 'women',
 'mattress',
 'breasts',
 'woman',
 'mattresses',
 'sofa',
 'lingerie',
 'lactation']
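
Not really: queen still comes out on top, but its similarity to the shifted vector drops from about 0.79 to 0.40, and the remaining neighbours are generic woman-related words rather than royalty, so subtracting prince pushes the vector away from the royal region instead of sharpening the analogy.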

Italy is to Rome as France is to ...

In [16]:
cosine(vector("Paris"), vector("Rome"))
Out[16]:
0.58241177
In [17]:
french_capital = vector("Rome") - vector("Italy") + vector("France")

cosine(vector("Paris"), french_capital)
Out[17]:
0.71733016
In [18]:
vocab = [w for w in nlp.vocab if w.is_title and w.prob >= -15]

nclosest(10, vocab, french_capital)
Out[18]:
['Rome',
 'France',
 'Paris',
 'Lyon',
 'Europe',
 'French',
 'Roman',
 'Prague',
 'Romans',
 'Versailles']
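
As in the earlier query, the input words themselves (Rome, France) dominate the top of the ranking. A common refinement, not applied in the original cells, is to drop the query words from the candidate list before ranking; a minimal sketch reusing the vocab and french_capital defined above:

# exclude the query words before looking for nearest neighbours
exclude = {"rome", "italy", "france"}
candidates = [w for w in vocab if w.lower_ not in exclude]
nclosest(10, candidates, french_capital)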