In [1]:
%%html
<style>
.rendered_html {
    font-size: 24px;
}
td {
    font-size: 20px;
}
</style>

Exploring non-contextualized embeddings

In [2]:
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import numpy as np

import heapq
import spacy
In [3]:
nlp = spacy.load('en_core_web_lg')

w.prob

Smoothed log probability estimate of token’s word type (context-independent entry in the vocabulary).

See https://spacy.io/api/token
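
In practice, more frequent words get a higher (less negative) smoothed log probability. A quick check (not part of the original run; note that with spaCy v3 models the probability table may only be populated when the spacy-lookups-data package is installed):

for text in ("the", "queen", "throne"):
    print(text, nlp.vocab[text].prob)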

In [4]:
# for illustration purposes, we'll explore only a subset of the
# vocabulary: lowercase words within a mid-range band of prior
# (smoothed log) probabilities
vocab = [w for w in nlp.vocab
         if w.is_lower and -4.5 >= w.prob >= -8.5]

vectors = np.stack([w.vector for w in vocab])

print(vectors.shape)
(363, 300)

Dimensionality reduction using t-SNE (2D)

In [5]:
tsne_model = TSNE(n_components=2, random_state=42)
reduced = tsne_model.fit_transform(vectors)
print(reduced.shape)
(363, 2)
In [6]:
x_coords, y_coords = reduced.T
plt.rc("figure", figsize=[25, 15]); plt.rc("font", size=14)
plt.scatter(x_coords, y_coords, color="white", marker=".")

for word, x, y in zip(vocab, x_coords, y_coords):
    plt.annotate(word.text, xy=(x, y), xytext=(0, 0), textcoords='offset points')

Dimensionality reduction using t-SNE (3D)

In [7]:
tsne_model = TSNE(n_components=3, random_state=42)

reduced = tsne_model.fit_transform(vectors)

print(reduced.shape)
(363, 3)
In [8]:
from mpl_toolkits.mplot3d import Axes3D


def plot_3d():
    x_coords, y_coords, z_coords = reduced.T
    plt.rc("figure", figsize=[40, 40]); plt.rc("font", size=18)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(x_coords, y_coords, z_coords)
    ax.set_xlim3d(min(x_coords)/2, max(x_coords)/2)
    ax.set_ylim3d(min(y_coords)/2, max(y_coords)/2)
    ax.set_zlim3d(min(z_coords)/2, max(z_coords)/2)

    for word, x, y, z in zip(vocab, x_coords, y_coords, z_coords):
        ax.text(x, y, z, word.text, None, color='black')
    plt.show()
In [9]:
plot_3d()

Fun with vector arithmetic

Analogies as transformations in vector space

In [10]:
def vector(word):
    return nlp.vocab[word].vector

def cosine(vector1, vector2):
    # cosine_similarity returns the full 2x2 similarity matrix for the pair;
    # take the off-diagonal entry (the diagonal is each vector's similarity
    # with itself, which is always 1)
    [[_, x], [_, _]] = cosine_similarity([vector1, vector2])
    return x

def nclosest(k, vocab, vector1):
    # the k vocabulary entries whose vectors are most similar to vector1
    return [w.text for w in heapq.nlargest(k, vocab,
                                           key=lambda w: cosine(vector1, w.vector))]
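
The analogy queries below all follow the same pattern: take the offset between two related words and apply it to a third, then look for the nearest vocabulary entries. As a sketch (the analogy helper below is not part of the original notebook; it just bundles the vector and nclosest helpers defined above):

def analogy(a, b, c, k=10, prob_floor=-15, titlecase=False):
    # "a is to b as c is to ?": apply the b - a offset to c's vector
    target = vector(b) - vector(a) + vector(c)
    candidates = [w for w in nlp.vocab
                  if (w.is_title if titlecase else w.is_lower)
                  and w.prob >= prob_floor]
    return nclosest(k, candidates, target)

For example, analogy("man", "king", "woman") corresponds to the first query below, and analogy("Italy", "Rome", "France", titlecase=True) to the last.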

man is to king as woman is to ...

In [11]:
cosine(vector("queen"), vector("king"))
Out[11]:
0.7252606
In [12]:
female_monarch = vector("king") - vector("man") + vector("woman")

cosine(vector("queen"), female_monarch)
Out[12]:
0.7880841
In [13]:
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]

nclosest(10, vocab, female_monarch)
Out[13]:
['king',
 'queen',
 'prince',
 'kings',
 'princess',
 'royal',
 'throne',
 'queens',
 'monarch',
 'kingdom']

Can we exaggerate the effect?

In [14]:
female_monarch = vector("king") - vector("man") - vector("prince") + vector("woman")

cosine(vector("queen"), female_monarch)
Out[14]:
0.3986122
In [15]:
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]

nclosest(10, vocab, female_monarch)
Out[15]:
['queen',
 'breast',
 'women',
 'mattress',
 'breasts',
 'woman',
 'mattresses',
 'sofa',
 'lingerie',
 'lactation']
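
Not really: queen still comes out on top, but its similarity to the shifted vector drops from about 0.79 to 0.40, and the remaining neighbours are generic woman-related words rather than royalty, so subtracting prince pushes the vector away from the royal region instead of sharpening the analogy.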

Italy is to Rome as France is to ...

In [16]:
cosine(vector("Paris"), vector("Rome"))
Out[16]:
0.58241177
In [17]:
french_capital = vector("Rome") - vector("Italy") + vector("France")

cosine(vector("Paris"), french_capital)
Out[17]:
0.71733016
In [18]:
vocab = [w for w in nlp.vocab if w.is_title and w.prob >= -15]

nclosest(10, vocab, french_capital)
Out[18]:
['Rome',
 'France',
 'Paris',
 'Lyon',
 'Europe',
 'French',
 'Roman',
 'Prague',
 'Romans',
 'Versailles']
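
As in the earlier query, the input words themselves (Rome, France) dominate the top of the ranking. A common refinement, not applied in the original cells, is to drop the query words from the candidate list before ranking; a minimal sketch reusing the vocab and french_capital defined above:

# exclude the query words before looking for nearest neighbours
exclude = {"rome", "italy", "france"}
candidates = [w for w in vocab if w.lower_ not in exclude]
nclosest(10, candidates, french_capital)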