Generate Synthetic Training Data

Open In Colab Download .ipynb

Data generation settings

# Example counts for the train / validation / test splits.
# NOTE(review): nothing in the visible cells reads these three values yet (the
# only visible run passes a literal count) - confirm where they are consumed.
# The trailing "@param" comments are Colab form annotations; keep them on the
# same line as the assignment.
NUM_TRAIN_EXAMPLES = 8000  # @param {type:"number"}
NUM_VAL_EXAMPLES = 1000  # @param {type:"number"}
NUM_TEST_EXAMPLES = 100 # @param {type:"number"}
# Sampling temperature passed to the generation model on every request.
TEMPERATURE = 0.8  # @param {type:"number"}

# Folder the generated .jsonl files are written into.
DATA_FOLDER = "./.data/generated"
# IPython shell escape (not plain Python): create the folder, including any
# missing parent directories, before anything is written to it.
!mkdir -p {DATA_FOLDER}

# Base URL and model identifier handed to the OpenAI client for data generation.
DATAGEN_URL = "https://openrouter.ai/api/v1"
DATAGEN_MODEL = "openai/gpt-5.1-chat"

Dataset diversity

# Subjects a generated code snippet can be about; one is drawn at random for
# every example. Each entry is inserted verbatim into the generation prompt.
#
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two topics into one.
#
# NOTE(review): a few entries look inaccurate as written ("Dictonary" spelling,
# ".format()" listed under dictionary methods, ".isinstance()" as a function
# method, "__init()__" / "__str()__"). They are left untouched here because
# they are prompt text - confirm and correct deliberately.
TOPICS = [
    "Strings",
    "input()",
    "print()",
    "Creating variables",
    "Concatenating strings",
    "Lists",
    "if/else constructs",
    "in operator",
    "list methods: append and remove",
    "list methods: index, pop, and insert",
    "list methods: slicing",
    "list methods: deleting an item",
    "for loops",
    "range() and str()",
    "== comparison operator",
    "len() function",
    "code commenting with #",
    "Comparison operators: !=, >, >=, <, <=",
    "String methods: .lower(), .upper(), .title(), .capitalize()",
    "Using the newline character in strings",
    "int()",
    "float()",
    "elif",
    "import keyword",
    "random module",
    "while keyword",
    "or and not operators",
    "booleans",
    "list methods: .clear(), .copy(), .count(), .extend(), .reverse(), .sort()",
    "Dictionaries",
    "Dictionary methods: .items(), .keys(), .values(), .update(), .pop()",
    "Dictonary methods: .get(), .format()",
    "String methods: .find(), .join(), .replace(), .split(), .swapcase()",
    "Functions: using def and return keywords",
    "Function methods: .isinstance()",
    "Raising exceptions",
    "Exceptions: TypeError() and ValueError()",
    "Function keywords: as and from",
    "The sys module",
    "The with keyword",
    "Tuples",
    "The lambda keyword",
    "The built-in map function",
    "The time module",
    "Built in methods: __init()__ and __str()__",
    "Double underscore for private methods",
    "Classes",
]

# Size buckets a generated snippet can fall into; each name selects a different
# request wording when the prompt is built.
CODE_LENGTH = ["short", "paragraph", "small_function", "large_function"]

# Relative sampling weight of each bucket, in the same order as CODE_LENGTH.
# All four buckets are equally likely.
CODE_LENGTH_WEIGHTS = [0.25] * len(CODE_LENGTH)

Model for structured output

from pydantic import BaseModel

# Structured-output schema for one generated example: it is passed as
# `text_format` when a completion is requested, and its two fields become the
# user / assistant messages of a dataset record.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - pydantic copies a model's docstring into its JSON schema
# description, which would change the schema sent to the API.
class CodeExplanation(BaseModel):
    code: str  # the generated Python snippet, returned as plain text
    explanation: str  # the explanation of what that snippet does

Get OpenRouter API key

import sys
import os
from dotenv import load_dotenv

# Make the OpenRouter key available through the environment.
# Outside Colab the key is expected to come from a .env file via load_dotenv();
# inside Colab it is copied from the notebook's user secrets.
if 'google.colab' not in sys.modules:
    load_dotenv()
else:
    from google.colab import userdata  # type:ignore
    os.environ['OPENROUTER_API_KEY'] = userdata.get('OPENROUTER_API_KEY')

Conversation generation functions

import openai
import os

# Shared API client for all generation requests. It talks to DATAGEN_URL and
# authenticates with the OPENROUTER_API_KEY environment variable (None if unset).
client = openai.OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url=DATAGEN_URL,
)

def generate_completion(prompt: str) -> CodeExplanation | None:
    """Send ``prompt`` as a single user message and return the parsed result.

    The request goes through the module-level ``client`` using
    ``DATAGEN_MODEL`` and ``TEMPERATURE``, without streaming, and asks for
    output shaped like ``CodeExplanation``.

    Args:
        prompt: Full text of the user message.

    Returns:
        The response's ``output_parsed`` value; per the return annotation this
        may be ``None``.
    """
    user_message = {"role": "user", "content": prompt}
    parsed_response = client.responses.parse(
        input=[user_message],
        model=DATAGEN_MODEL,
        text_format=CodeExplanation,
        temperature=TEMPERATURE,
        stream=False,
    )
    return parsed_response.output_parsed

def create_conversation(topic: str, code_length: str) -> CodeExplanation | None:
    """Build the generation prompt for ``topic`` and request a completion.

    Args:
        topic: Subject the generated code should be about; inserted verbatim
            into the prompt.
        code_length: Size bucket ("short", "paragraph", "small_function" or
            "large_function"). Any other value falls back to a generic
            request with no size constraint.

    Returns:
        Whatever ``generate_completion`` returns for the assembled prompt.
    """
    # Request wording for each known size bucket.
    wording_by_length = {
        "short": f"2 - 4 lines of Python code about {topic}",
        "paragraph": f"3 - 6 lines of Python code about {topic}",
        "small_function": f"a small function (around 10 lines of Python code) about {topic}",
        "large_function": f"a large function (around 10 - 20 lines of Python code) about {topic}",
    }
    snippet_spec = wording_by_length.get(
        code_length, f"a Python code example about {topic}"
    )

    prompt = f"""
        Generate me {snippet_spec}.

        For this selection of code, generate a short 2 paragraph explanation of what the selected code does:
        - The explanation should be suitable for a high school student learning Python.
        - When it makes sense, the second paragraph of the explanation should use an analogy to help the student better understand the code.
        - DO NOT wrap the code in a ```python block

        Return the following:
        1. The original code as a string. 
        2. Your explanation of what the selected code does as a string.
    """

    return generate_completion(prompt)

Dataset generation functions

import random
import json
from tqdm import tqdm

def generate_dataset(num_examples: int, filename: str) -> None:
    """Write ``num_examples`` generated conversations to ``filename`` as JSON Lines.

    For every example a topic is drawn uniformly from ``TOPICS`` and a size
    bucket from ``CODE_LENGTH`` (weighted by ``CODE_LENGTH_WEIGHTS``).
    ``create_conversation`` is then called until it returns a result, and one
    JSON object of the form
    ``{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}``
    is written per line: the generated code is the user message and its
    explanation is the assistant message.

    Args:
        num_examples: Number of records to write.
        filename: Path of the output file; it is created or overwritten and
            written as UTF-8.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for idx in tqdm(range(num_examples)):
            topic = random.choice(TOPICS)
            code_length = random.choices(CODE_LENGTH, weights=CODE_LENGTH_WEIGHTS)[0]

            # Retry with the same topic and size until a parsed result comes
            # back. There is deliberately no retry cap, matching the original
            # behaviour; each failed attempt is reported.
            while (conversation := create_conversation(topic, code_length)) is None:
                print(f"Error generating conversation for example {idx}")

            record = {
                "messages": [
                    {"role": "user", "content": conversation.code},
                    {
                        "role": "assistant",
                        "content": conversation.explanation,
                    },
                ]
            }
            f.write(json.dumps(record) + "\n")
            # Flush after every record so finished examples reach the file
            # while a long run is still in progress.
            f.flush()
    # No explicit close: leaving the ``with`` block flushes and closes the file.

Generate all the data!

from datetime import datetime

# Take the timestamp once so the train / valid / test files of a single run
# always carry the same stamp; separate now() calls can straddle a second
# boundary and produce mismatched names.
# NOTE(review): the stamp contains ':' characters, which are not allowed in
# file names on Windows - the format is kept unchanged here in case other code
# matches on these names.
_RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

TRAIN_FILE = f"{DATA_FOLDER}/train_{_RUN_TIMESTAMP}.jsonl"
VALID_FILE = f"{DATA_FOLDER}/valid_{_RUN_TIMESTAMP}.jsonl"
TEST_FILE = f"{DATA_FOLDER}/test_{_RUN_TIMESTAMP}.jsonl"

# NOTE(review): this run writes a hard-coded 10 examples to the test file
# rather than NUM_TEST_EXAMPLES, and the train / valid files are not generated
# here - presumably a smoke test; confirm before a full run.
generate_dataset(10, TEST_FILE)
100%|██████████| 10/10 [00:41<00:00,  4.12s/it]