Back to Article
Word2vec
Download Notebook

Word2vec

Train a simple word2vec model using Gensim. Use the small Wikipedia corpus from 'enlang1.txt'.

In [2]:
!wget https://raw.githubusercontent.com/mlcollege/natural-language-processing/master/data/corpora/enlang1.txt
In [3]:
!pip install gensim
In [4]:
import gensim, logging

# Show gensim's progress messages (vocab building, training epochs) while fitting.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Build the training corpus: one whitespace-tokenized sentence per line of the file.
sentences = []
with open('enlang1.txt', encoding='utf-8') as f:
    # Iterate the file object lazily instead of f.readlines(), which would
    # materialize the entire file in memory as a list of lines first.
    for line in f:
        sentences.append(line.strip().split())

# Train Word2Vec: 50-dimensional vectors; words occurring fewer than 3 times
# in the corpus are ignored.
model = gensim.models.Word2Vec(sentences, vector_size=50, min_count=3)
In [5]:
print(model.wv['car'])
[-0.6982817   2.3581364   0.5011609  -0.60018486 -0.33697867  1.502611
 -0.86960953  3.356093    1.2783765  -2.485869   -1.2212014   1.2542228
  1.51994    -2.1035402  -0.6679091  -0.31157556  0.14391106 -1.7851443
  1.3146995  -3.0246205  -0.75486225  1.2262667  -0.6552876  -0.36967877
  0.57343364 -0.86407775 -0.15070374  2.1927333   1.3338499  -0.62077355
 -1.0623326  -0.47078207  0.20871846  2.0925546  -1.8752809   0.90823495
  1.0370363  -0.1633322   0.3681553   1.3255799   1.4786295   1.6821818
  0.9131644  -3.1880014   0.55028105 -1.6433401   3.763089   -0.00381869
 -0.0066908   0.6364674 ]
In [6]:
model.wv.most_similar(positive=['cars', 'bus'], negative=['car'])
[('buses', 0.8990582227706909),
 ('routes', 0.8619767427444458),
 ('roads', 0.8201125860214233),
 ('intercity', 0.819900393486023),
 ('platforms', 0.8192744255065918),
 ('trains', 0.8124856352806091),
 ('rail', 0.806613028049469),
 ('freight', 0.8052050471305847),
 ('stations', 0.7834478616714478),
 ('transport', 0.7786622047424316)]

Import better models

Import word vectors trained on the Common Crawl corpus (600 B tokens) and experiment with them.

In [6]:
!pip install -U --no-cache-dir gdown --pre
!gdown https://drive.google.com/uc?id=1pt_yxrbRIRG4bHufCyIf6POayH3gE6f4
!bunzip2 crawl-300.vec.bz2
In [8]:
from gensim.models.keyedvectors import KeyedVectors
# Load the pretrained vectors in word2vec text format (binary=False).
# Per the log output below this is a 500,000-word vocabulary of 300-dim vectors;
# loading the plain-text file takes a few minutes.
word_vectors = KeyedVectors.load_word2vec_format('crawl-300.vec', binary=False)
2026-04-11 12:38:57,920 : INFO : loading projection weights from crawl-300.vec
2026-04-11 12:43:40,981 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (500000, 300) matrix of type float32 from crawl-300.vec', 'binary': False, 'encoding': 'utf8', 'datetime': '2026-04-11T12:43:40.981210', 'gensim': '4.4.0', 'python': '3.12.3 (main, Dec 16 2024, 17:30:55) [GCC 11.4.0]', 'platform': 'Linux-6.5.0-41-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
In [9]:
word_vectors.most_similar(positive=['kings', 'queen'], negative=['king'])
[('queens', 0.8387579917907715),
 ('queen.', 0.6004167199134827),
 ('monarchs', 0.5899762511253357),
 ('Queen', 0.5859925746917725),
 ('empresses', 0.577515184879303),
 ('princes', 0.5499585270881653),
 ('QUEEN', 0.5448766350746155),
 ('royals', 0.5442696809768677),
 ('princesses', 0.5383292436599731),
 ('royal', 0.5232110023498535)]
In [10]:
word_vectors.most_similar(positive=['woman', 'husband'], negative=['man'])
[('wife', 0.7529045343399048),
 ('daughter', 0.6500851511955261),
 ('mother-in-law', 0.6470040082931519),
 ('spouse', 0.6457177400588989),
 ('husbands', 0.6331113576889038),
 ('mother', 0.6005340814590454),
 ('ex-husband', 0.5952432751655579),
 ('daughter-in-law', 0.5948172807693481),
 ('ex-wife', 0.5728636384010315),
 ('daughters', 0.5600825548171997)]
In [11]:
word_vectors.most_similar(positive=['Paris', 'Spain'], negative=['France'])
[('Madrid', 0.8625079989433289),
 ('Barcelona', 0.7637037634849548),
 ('Sevilla', 0.6874054074287415),
 ('Seville', 0.6747831106185913),
 ('Malaga', 0.6494932174682617),
 ('Zaragoza', 0.6459373831748962),
 ('Valencia', 0.6383104920387268),
 ('Alicante', 0.6115807890892029),
 ('Salamanca', 0.6041630506515503),
 ('Murcia', 0.6019026041030884)]
In [12]:
word_vectors.most_similar(positive=['Macron', 'France'], negative=['Portugal'])
[('Hollande', 0.6321580410003662),
 ('Sarkozy', 0.5803262591362),
 ('Chirac', 0.5736144781112671),
 ('Fillon', 0.5704934000968933),
 ('French', 0.5427036285400391),
 ('Melenchon', 0.5314571261405945),
 ('Juppe', 0.5297616720199585),
 ('Mitterrand', 0.5275521278381348),
 ('Juppé', 0.5251182913780212),
 ('Mélenchon', 0.5208674073219299)]