Word2Vec

Word2Vec trains a small neural network to predict which words appear near each other. The hidden-layer weights become word embeddings — vectors that place similar words close together in space.

This notebook builds a skip-gram Word2Vec model from scratch using NumPy, then visualises the learned embeddings with PCA.

First, we define our corpus — the sentences the model will train on — and build a vocabulary that maps each unique word to an integer index.

# Toy training corpus: one short, lowercase, whitespace-tokenised sentence
# per entry.
corpus = [
    "the dog barks loudly",
    "the cat meows softly",
    "the dog chases the cat",
    "the cat sleeps quietly",
    "the dog runs fast",
    "the bird flies high",
    "the bird sings beautifully",
    "the fish swims fast",
    "the fish lives in water",
    "dogs and cats are pets",
    "birds and fish are animals",
    "dogs love to run",
    "cats love to sleep",
    "birds love to sing",
    "fish love to swim",
    "the dog and cat play together",
    "animals need food and water",
    "pets live with people",
    "wild birds fly freely",
    "big dogs run fast",
]

# Every token in corpus order (duplicates kept). Joining the sentences with a
# space and splitting once yields the same tokens as splitting each sentence.
all_words = " ".join(corpus).split()

# Alphabetically sorted unique tokens; a word's position here is its index.
vocab = sorted(set(all_words))

# Forward (word -> index) and reverse (index -> word) lookup tables.
word_to_idx = {word: index for index, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

vocab_size = len(vocab)
print(f"Vocabulary: {vocab_size} words: {', '.join(vocab)}")

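Now we generate the training pairs. Each pair is a (center word, context word) combination, stored as vocabulary indices, where the context word appears within `window_size` positions (two by default) of the center word in the same sentence.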

def generate_pairs(corpus, window_size=2):
    """Build skip-gram (center, context) training pairs from the corpus.

    Each sentence is split on whitespace and mapped to vocabulary indices via
    the module-level ``word_to_idx``. Every token is then paired with each
    other token at most ``window_size`` positions away in the same sentence.

    Args:
        corpus: Iterable of sentence strings.
        window_size: Maximum distance between a center and a context token.

    Returns:
        A list of ``(center_index, context_index)`` tuples, ordered by
        sentence, then center position, then context position.

    Raises:
        KeyError: If a sentence contains a word missing from ``word_to_idx``.
    """
    training_pairs = []
    for line in corpus:
        token_ids = [word_to_idx[token] for token in line.split()]
        n_tokens = len(token_ids)
        for pos, center_id in enumerate(token_ids):
            # Clip the context window to the sentence boundaries.
            first = max(0, pos - window_size)
            stop = min(n_tokens, pos + window_size + 1)
            training_pairs.extend(
                (center_id, token_ids[ctx])
                for ctx in range(first, stop)
                if ctx != pos  # a token is never its own context
            )
    return training_pairs

# Build the (center, context) index pairs using the default window size.
pairs = generate_pairs(corpus)
print(f"Training pairs: {len(pairs)}\n")
# pairs[16:17] is a one-element slice, so the sample is printed inside a list
# (and would print [] instead of raising if fewer than 17 pairs existed).
print(f"Example pair: {pairs[16:17]}")

Let’s do some training!

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# --- Model weights ---
embedding_dim = 10
learning_rate = 0.05
epochs = 300

# Fixed seed so the random initialisation (and therefore the run) is repeatable.
np.random.seed(42)
W1 = np.random.randn(vocab_size, embedding_dim) * 0.1  # input embeddings
W2 = np.random.randn(embedding_dim, vocab_size) * 0.1  # output weights


def softmax(x):
    """Return the softmax of the 1-D score vector ``x``.

    The maximum score is subtracted before exponentiating so that large
    scores do not overflow; this does not change the result.
    """
    e = np.exp(x - np.max(x))
    return e / e.sum()


# --- Training loop ---
# Plain SGD over one (center, context) pair at a time with a full softmax
# over the vocabulary.
print("Training...")
for epoch in range(epochs):
    lr = learning_rate * (1 - epoch / epochs)  # decay: large steps early, small steps late
    total_loss = 0
    for center_idx, context_idx in pairs:
        h = W1[center_idx]              # embedding lookup
        y_hat = softmax(h @ W2)         # predicted distribution
        # Cross-entropy of the true context word; 1e-10 guards against log(0).
        total_loss -= np.log(y_hat[context_idx] + 1e-10)

        error = y_hat.copy()
        error[context_idx] -= 1         # cross-entropy + softmax gradient

        # Both gradients are computed from the current weights before either
        # matrix is updated.
        dW2 = np.outer(h, error)
        dW1 = W2 @ error
        W2 -= lr * dW2
        W1[center_idx] -= lr * dW1

    # Report the mean per-pair loss every 60 epochs.
    if (epoch + 1) % 60 == 0:
        print(f"  Epoch {epoch+1}/{epochs}  loss: {total_loss / len(pairs):.4f}")

print("Training complete!")

Let’s test by looking up words that are similar to each other…

# --- Nearest-neighbour lookup ---
def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    A small constant (1e-10) is added to the product of the norms so that a
    zero-length vector yields 0.0 instead of a division by zero.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    return float(dot_product / norm_product)

def most_similar(word, n=10):
    """Return the ``n`` vocabulary words whose embeddings are closest to ``word``.

    Similarity is the cosine similarity between rows of the module-level
    input-embedding matrix ``W1``. The query word itself is excluded.

    Args:
        word: Query word; must be a key of ``word_to_idx``.
        n: Maximum number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted from most to least
        similar; ties keep vocabulary order.

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    query_vec = W1[word_to_idx[word]]
    scored = []
    for candidate in vocab:
        if candidate == word:
            continue
        score = cosine_sim(query_vec, W1[word_to_idx[candidate]])
        scored.append((candidate, score))
    # Negated score as key gives a stable descending sort.
    scored.sort(key=lambda pair: -pair[1])
    return scored[:n]

print()
# Query word for the nearest-neighbour lookup. The trailing "#@param" comment
# looks like a Colab form annotation listing the suggested choices, so it must
# stay on the same line as the assignment (assumption — confirm in Colab).
WORD = "dog" #@param ["dog", "cat", "bird", "fish", "run", "water"]
# To compare several words in one run, wrap the two statements below in a loop
# over a list such as ["dog", "cat", "bird", "fish", "run", "water"].
# One "word (similarity)" line per neighbour, similarity shown to 2 decimals.
result = "\n".join(f"{w} ({s:.2f})" for w, s in most_similar(WORD))
print(f"Similar to '{WORD}':\n{result}")

Before looking at the scatter plot, take a moment to record what you noticed.

Look at the words listed as most similar to ‘dog’. Do the results make sense? Try changing the WORD parameter to ‘cat’, ‘bird’, or ‘water’. Write down what you notice about which words end up close together and why the model might have learned that.

Finally, let’s view every word in the vocabulary on a 2D scatter plot…

# --- PCA visualisation ---
# Stack the learned input embeddings in vocabulary order, one row per word.
embeddings = np.array([W1[word_to_idx[w]] for w in vocab])
# Project the embeddings onto their first two principal components.
coords = PCA(n_components=2).fit_transform(embeddings)

plt.figure(figsize=(10, 8))
# coords.T unpacks into the x-coordinate row and the y-coordinate row.
plt.scatter(*coords.T, s=40, alpha=0.6)
for i, word in enumerate(vocab):
    # Label each point, nudged 4 points up and to the right of the marker.
    plt.annotate(
        word,
        xy=tuple(coords[i]),
        xytext=(4, 4),
        textcoords="offset points",
        fontsize=9,
    )
plt.title("Word2Vec Embeddings (PCA projection)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.tight_layout()
plt.show()

{ "question_type": "multiple_choice", "question": "On the scatter plot, where would you expect ‘dog’ and ‘cat’ to appear relative to each other?", "options": [ { "key": "a", "text": "Close together, because they appear near similar words in the training sentences" }, { "key": "b", "text": "Far apart, because they are different animals" }, { "key": "c", "text": "In the same spot, because the model treats all animals the same" }, { "key": "d", "text": "At random positions, because the model doesn’t understand meaning" } ], "answer": "a", "submitted_answer": "" }

{ "question_type": "multiple_choice", "question": "Why does Word2Vec learn that ‘dog’ and ‘cat’ are similar?", "options": [ { "key": "a", "text": "Because they both have three letters" }, { "key": "b", "text": "Because they appear near the same words in the training sentences (like ‘the’ and ‘and’)" }, { "key": "c", "text": "Because the programmer told the model they were similar" }, { "key": "d", "text": "Because they both appear at the start of sentences" } ], "answer": "b", "submitted_answer": "" }

{ "question_type": "true_false", "question": "Words that appear near the same words in sentences will tend to end up with similar embeddings after training.", "answer": "True", "submitted_answer": "" }