CBOW: The input to the model is the context of the current word, i.e. the surrounding words wi−2, wi−1, wi+1, wi+2.
The output of the neural network is wi. Hence you can think of the task as "predicting the word given its context."
Skip-gram: The input to the model is wi, and the output is its context wi−2, wi−1, wi+1, wi+2. So the task here is "predicting the context given a word."
According to Mikolov:
Skip-gram: works well with a small amount of training data and represents even rare words or phrases well.
CBOW: several times faster to train than skip-gram, with slightly better accuracy for frequent words.
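In gensim, switching between the two architectures is just the sg flag of Word2Vec (sg=0 for CBOW, the default; sg=1 for skip-gram). A minimal sketch, using an illustrative toy corpus rather than real training data:
from gensim.models import Word2Vec

toy_corpus = [['the', 'quick', 'brown', 'fox'], ['the', 'lazy', 'dog']]  # illustrative only

# sg=0 -> CBOW (predict the word from its context); sg=1 -> skip-gram (predict the context from the word).
# window controls how many words on each side count as context.
cbow_model = Word2Vec(toy_corpus, sg=0, window=2, min_count=1)
skipgram_model = Word2Vec(toy_corpus, sg=1, window=2, min_count=1)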
# !pip install --upgrade gensim
import gensim.downloader as api
# Download (a large file, on first use) and load the pretrained Google News vectors (300 dimensions)
model = api.load('word2vec-google-news-300')
# !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
## https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
## https://github.com/mmihaltz/word2vec-GoogleNews-vectors/tree/master
# !gzip -d GoogleNews-vectors-negative300.bin.gz
# from gensim.models import KeyedVectors
# filename = 'GoogleNews-vectors-negative300.bin'
# model = KeyedVectors.load_word2vec_format(filename, binary=True)
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'democracy'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, model.similarity(w1, w2)))
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'democracy'	0.08
print(model.most_similar(positive=['car', 'minivan'], topn=5))
[('SUV', 0.8532192707061768), ('vehicle', 0.8175783753395081), ('pickup_truck', 0.7763688564300537), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.7565720081329346)]
# irrelevant word
print(model.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'language']))
language
# calculate: (king - man) + woman = ?
# Find the top-N most similar words.
# Positive words contribute positively towards the similarity, negative words negatively.
# This method computes cosine similarity between a simple mean of the projection weight vectors
# of the given words and the vectors for each word in the model
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
[('queen', 0.7118193507194519)]
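For intuition, the analogy can be reproduced by hand along the lines of the comment above: average the unit-normalized vectors of the positive words and the negated vectors of the negative words, then rank every vocabulary word by cosine similarity to that mean. A rough sketch, assuming the KeyedVectors object loaded above (gensim's own implementation differs in details such as norm caching):
import numpy as np

def analogy(positive, negative, topn=1):
    # unit-normalized vectors: +1 for positive words, -1 for negative words
    query = [model.get_vector(w, norm=True) for w in positive]
    query += [-model.get_vector(w, norm=True) for w in negative]
    mean = np.mean(query, axis=0)
    mean /= np.linalg.norm(mean)
    # cosine similarity of the mean against every word in the vocabulary
    sims = (model.vectors @ mean) / np.linalg.norm(model.vectors, axis=1)
    exclude = set(positive) | set(negative)
    ranked = [(model.index_to_key[i], float(sims[i]))
              for i in np.argsort(-sims) if model.index_to_key[i] not in exclude]
    return ranked[:topn]

print(analogy(['woman', 'king'], ['man']))  # expected to be close to the library result above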
# body parts
result = model.most_similar(positive=['leg','hand'], topn=5)
print(result)
[('forearm', 0.6304388642311096), ('arm', 0.6039904356002808), ('thigh', 0.6026855707168579), ('shoulder', 0.5886558294296265), ('finger', 0.58832186460495)]
%matplotlib inline
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
Countries_Capitals = ["Germany","France","Italy","Spain","Belgium","Berlin","Paris","Rome","Madrid","Brussels"]
vecs = [model[cc] for cc in Countries_Capitals]
vecs = np.stack(vecs, axis=0)
# Fit a 2D PCA model to the vectors
# Principal component analysis (PCA): linear dimensionality reduction using
# Singular Value Decomposition of the data to project it to a lower-dimensional space.
# The input data is centered but not scaled for each feature before applying the SVD.
pca = PCA(n_components=2)
result = pca.fit_transform(vecs)
# Create a scatter plot of the projection
plt.scatter(result[:, 0], result[:, 1])
# Annotate the points in the scatter plot
for i, word in enumerate(Countries_Capitals):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()
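As a sanity check on the comment above (PCA centers, but does not scale, the data before an SVD), the same 2-D projection can be computed by hand. A minimal sketch; it should match sklearn's result only up to the sign of each component:
centered = vecs - vecs.mean(axis=0)            # center, but do not scale, each feature
U, S, Vt = np.linalg.svd(centered, full_matrices=False)
manual = centered @ Vt[:2].T                   # project onto the top two right singular vectors
print(np.allclose(np.abs(manual), np.abs(result), atol=1e-3))  # expected: True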
from gensim.models import Word2Vec
# Define training data
sentences = [
    ['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['yet', 'another', 'sentence'],
    ['one', 'more', 'sentence'],
    ['and', 'the', 'final', 'sentence'],
]
# Train model
# Words below the min_count frequency are dropped before training occurs.
# So, the relevant context window is the word-distance among surviving words.
t_model = Word2Vec(sentences, min_count=1)
# Summarize vocabulary
# words = list(t_model.wv.vocab) # Gensim 3
words = t_model.wv.index_to_key # Gensim 4
print("Vocabulary:")
print(words)
# Access vector for one word
print("Sentence vector:")
print(t_model.wv.__getitem__('sentence'))
# Save model
t_model.save('model.bin')
# Load model
new_model = Word2Vec.load('model.bin')
print("The model:")
print(new_model)
Vocabulary:
['sentence', 'the', 'is', 'this', 'final', 'and', 'more', 'one', 'another', 'yet', 'second', 'word2vec', 'for', 'first']
Vector for the word 'sentence':
[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03 -9.3029495e-03
 -7.1168090e-03  6.4588725e-03  8.9729885e-03 -5.0154282e-03 -3.7633716e-03
  7.3805046e-03 -1.5334714e-03 -4.5366134e-03  6.5540518e-03 -4.8601604e-03
 -1.8160177e-03  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04  6.3508903e-03
 -3.4053659e-03 -9.4640139e-04  5.7685734e-03 -7.5216377e-03 -3.9361035e-03
 -7.5115822e-03 -9.3004224e-04  9.5381187e-03 -7.3191668e-03 -2.3337686e-03
 -1.9377411e-03  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03 -3.5099984e-05
 -2.9618145e-04 -7.6612402e-03  9.6147433e-03  4.9820580e-03  9.2331432e-03
 -8.1579173e-03  4.4957981e-03 -4.1370760e-03  8.2453608e-04  8.4986202e-03
 -4.4621765e-03  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03 -1.5080082e-03
  2.4697948e-03 -8.8802696e-04  5.5336617e-03 -2.7429771e-03  2.2600652e-03
  5.4557943e-03  8.3459532e-03 -1.4537406e-03 -9.2081428e-03  4.3705525e-03
  5.7178497e-04  7.4419081e-03 -8.1328274e-04 -2.6384138e-03 -8.7530091e-03
 -8.5655687e-04  2.8265631e-03  5.4014288e-03  7.0526563e-03 -5.7031214e-03
  1.8588197e-03  6.0888636e-03 -4.7980510e-03 -3.1072604e-03  6.7976294e-03
  1.6314756e-03  1.8991709e-04  3.4736372e-03  2.1777749e-04  9.6188262e-03
  5.0606038e-03 -8.9173904e-03 -7.0415605e-03  9.0145587e-04  6.3925339e-03]
The model:
Word2Vec<vocab=14, vector_size=100, alpha=0.025>
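To see the effect of the min_count remark above, retrain with min_count=2: words that occur only once in the toy corpus should be dropped from the vocabulary before training. A quick sketch (the expected surviving words are my guess from the toy sentences, not a quoted output):
pruned = Word2Vec(sentences, min_count=2)
print(sorted(pruned.wv.index_to_key))  # expected to keep only 'is', 'sentence', 'the', 'this'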
# Fit a 2d PCA model to the vectors
X = new_model.wv.vectors
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# Create a scatter plot of the projection
plt.scatter(result[:, 0], result[:, 1])
words = list(new_model.wv.index_to_key)
# Annotate the points in the scatter plot
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()