!wget https://raw.githubusercontent.com/mlcollege/natural-language-processing/master/data/corpora/enlang1.txt
Implement a simple word2vec estimator using Gensim. Use the small Wikipedia corpus from ‘enlang1.txt’.
In [2]:
In [3]:
!pip install gensim
In [4]:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = []
with open('enlang1.txt') as f:
for line in f.readlines():
sentences.append(line.strip().split())
model = gensim.models.Word2Vec(sentences, vector_size= 50, min_count=3)In [5]:
print(model.wv['car'])[-0.6982817 2.3581364 0.5011609 -0.60018486 -0.33697867 1.502611
-0.86960953 3.356093 1.2783765 -2.485869 -1.2212014 1.2542228
1.51994 -2.1035402 -0.6679091 -0.31157556 0.14391106 -1.7851443
1.3146995 -3.0246205 -0.75486225 1.2262667 -0.6552876 -0.36967877
0.57343364 -0.86407775 -0.15070374 2.1927333 1.3338499 -0.62077355
-1.0623326 -0.47078207 0.20871846 2.0925546 -1.8752809 0.90823495
1.0370363 -0.1633322 0.3681553 1.3255799 1.4786295 1.6821818
0.9131644 -3.1880014 0.55028105 -1.6433401 3.763089 -0.00381869
-0.0066908 0.6364674 ]
In [6]:
model.wv.most_similar(positive=['cars', 'bus'], negative=['car'])
[('buses', 0.8990582227706909),
('routes', 0.8619767427444458),
('roads', 0.8201125860214233),
('intercity', 0.819900393486023),
('platforms', 0.8192744255065918),
('trains', 0.8124856352806091),
('rail', 0.806613028049469),
('freight', 0.8052050471305847),
('stations', 0.7834478616714478),
('transport', 0.7786622047424316)]
Import better models
Import word vectors trained on Common Crawl corpus (600 B tokens) and play with it.
In [7]:
!pip install -U --no-cache-dir gdown --pre
!gdown https://drive.google.com/uc?id=1pt_yxrbRIRG4bHufCyIf6POayH3gE6f4
!bunzip2 crawl-300.vec.bz2
In [8]:
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('crawl-300.vec', binary=False)
2026-04-11 12:38:57,920 : INFO : loading projection weights from crawl-300.vec
2026-04-11 12:43:40,981 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (500000, 300) matrix of type float32 from crawl-300.vec', 'binary': False, 'encoding': 'utf8', 'datetime': '2026-04-11T12:43:40.981210', 'gensim': '4.4.0', 'python': '3.12.3 (main, Dec 16 2024, 17:30:55) [GCC 11.4.0]', 'platform': 'Linux-6.5.0-41-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
In [9]:
word_vectors.most_similar(positive=['kings', 'queen'], negative=['king'])
[('queens', 0.8387579917907715),
('queen.', 0.6004167199134827),
('monarchs', 0.5899762511253357),
('Queen', 0.5859925746917725),
('empresses', 0.577515184879303),
('princes', 0.5499585270881653),
('QUEEN', 0.5448766350746155),
('royals', 0.5442696809768677),
('princesses', 0.5383292436599731),
('royal', 0.5232110023498535)]
In [10]:
word_vectors.most_similar(positive=['woman', 'husband'], negative=['man'])
[('wife', 0.7529045343399048),
('daughter', 0.6500851511955261),
('mother-in-law', 0.6470040082931519),
('spouse', 0.6457177400588989),
('husbands', 0.6331113576889038),
('mother', 0.6005340814590454),
('ex-husband', 0.5952432751655579),
('daughter-in-law', 0.5948172807693481),
('ex-wife', 0.5728636384010315),
('daughters', 0.5600825548171997)]
In [11]:
word_vectors.most_similar(positive=['Paris', 'Spain'], negative=['France'])
[('Madrid', 0.8625079989433289),
('Barcelona', 0.7637037634849548),
('Sevilla', 0.6874054074287415),
('Seville', 0.6747831106185913),
('Malaga', 0.6494932174682617),
('Zaragoza', 0.6459373831748962),
('Valencia', 0.6383104920387268),
('Alicante', 0.6115807890892029),
('Salamanca', 0.6041630506515503),
('Murcia', 0.6019026041030884)]
In [12]:
word_vectors.most_similar(positive=['Macron', 'France'], negative=['Portugal'])
[('Hollande', 0.6321580410003662),
('Sarkozy', 0.5803262591362),
('Chirac', 0.5736144781112671),
('Fillon', 0.5704934000968933),
('French', 0.5427036285400391),
('Melenchon', 0.5314571261405945),
('Juppe', 0.5297616720199585),
('Mitterrand', 0.5275521278381348),
('Juppé', 0.5251182913780212),
('Mélenchon', 0.5208674073219299)]