%%html
<style>
  .rendered_html {
    font-size: 24px;
  }
  td {
    font-size: 20px;
  }
</style>
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np
import heapq
import spacy
nlp = spacy.load('en_core_web_lg')
w.prob
Smoothed log probability estimate of a token's word type (a context-independent entry in the vocabulary).
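For a quick illustration (this assumes the loaded model ships a probability table, as spaCy v2's en_core_web_lg does; other pipelines may return a flat default), common words score higher than rare ones:
# log probabilities are negative; closer to zero means more frequent
for text in ["the", "king", "sesquipedalian"]:
    print(text, nlp.vocab[text].prob)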
# for illustration purposes, we'll explore only a subset
# of the vocabulary within a band of prior probabilities
vocab = [w for w in nlp.vocab
         if w.is_lower and -8.5 <= w.prob <= -4.5]
vectors = np.stack([w.vector for w in vocab])
print(vectors.shape)  # (number of words, 300) for en_core_web_lg
Dimensionality reduction using t-SNE (2D)
tsne_model = TSNE(n_components=2, random_state=42)
reduced = tsne_model.fit_transform(vectors)
print(reduced.shape)
x_coords, y_coords = reduced.T
plt.rc("figure", figsize=[25, 15]); plt.rc("font", size=14)
plt.scatter(x_coords, y_coords, color="white", marker=".")
for word, x, y in zip(vocab, x_coords, y_coords):
plt.annotate(word.text, xy=(x, y), xytext=(0, 0), textcoords='offset points')
Dimensionality reduction using t-SNE (3D)
tsne_model = TSNE(n_components=3, random_state=42)
reduced = tsne_model.fit_transform(vectors)
print(reduced.shape)
from mpl_toolkits.mplot3d import Axes3D

def plot_3d():
    x_coords, y_coords, z_coords = reduced.T
    plt.rc("figure", figsize=[40, 40]); plt.rc("font", size=18)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x_coords, y_coords, z_coords)
    # zoom in on the central portion of each axis
    ax.set_xlim3d(min(x_coords)/2, max(x_coords)/2)
    ax.set_ylim3d(min(y_coords)/2, max(y_coords)/2)
    ax.set_zlim3d(min(z_coords)/2, max(z_coords)/2)
    for word, x, y, z in zip(vocab, x_coords, y_coords, z_coords):
        ax.text(x, y, z, word.text, None, color='black')
    plt.show()

plot_3d()
Analogies as transformations in vector space
def vector(word):
    return nlp.vocab[word].vector

def cosine(vector1, vector2):
    # cosine_similarity returns a symmetric 2x2 matrix;
    # we only need one off-diagonal entry
    [[_, x], [_, _]] = cosine_similarity([vector1, vector2])
    return x

def nclosest(k, vocab, vector1):
    # the k vocabulary entries whose vectors are most similar to vector1
    return [w.text for w in heapq.nlargest(k, vocab,
                                           key=lambda w: cosine(vector1, w.vector))]
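A quick sanity check of these helpers (the example words are ours, chosen only for illustration):
print(cosine(vector("dog"), vector("cat")))      # related pair: similarity close to 1
print(cosine(vector("dog"), vector("algebra")))  # unrelated pair: much lower similarity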
man is to king as woman is to ...
cosine(vector("queen"), vector("king"))
female_monarch = vector("king") - vector("man") + vector("woman")
cosine(vector("queen"), female_monarch)
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]
nclosest(10, vocab, female_monarch)
Can we exaggerate the effect?
female_monarch = vector("king") - vector("man") - vector("prince") + vector("woman")
cosine(vector("queen"), female_monarch)
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]
nclosest(10, vocab, female_monarch)
Italy is to Rome as France is to ...
cosine(vector("Paris"), vector("Rome"))
french_capital = vector("Rome") - vector("Italy") + vector("France")
cosine(vector("Paris"), french_capital)
vocab = [w for w in nlp.vocab if w.is_title and w.prob >= -15]
nclosest(10, vocab, french_capital)
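The same recipe carries over to other relations. As a sketch, the helpers above can probe a verb-tense analogy (walk is to walked as swim is to ...):
past_tense = vector("walked") - vector("walk") + vector("swim")
cosine(vector("swam"), past_tense)
vocab = [w for w in nlp.vocab if w.is_lower and w.prob >= -15]
nclosest(10, vocab, past_tense)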