# --- Generate Synthetic Training Data: data generation settings ---

NUM_TRAIN_EXAMPLES = 8000 # @param {type:"number"}
NUM_VAL_EXAMPLES = 1000 # @param {type:"number"}
NUM_TEST_EXAMPLES = 100 # @param {type:"number"}
TEMPERATURE = 0.8 # @param {type:"number"}

DATA_FOLDER = "./.data/generated"

# Create the output directory. os.makedirs replaces the original IPython
# shell magic (`!mkdir -p {DATA_FOLDER}`) so this also runs as plain Python
# outside a notebook; exist_ok matches `mkdir -p` semantics.
import os
os.makedirs(DATA_FOLDER, exist_ok=True)

# OpenRouter endpoint and model used for synthetic data generation.
DATAGEN_URL = "https://openrouter.ai/api/v1"
DATAGEN_MODEL = "openai/gpt-5.1-chat"
Data generation settings
Dataset diversity
# Topics the generator samples from; one topic is chosen uniformly per example.
TOPICS = [
    "Strings",
    "input()",
    "print()",
    "Creating variables",
    "Concatenating strings",  # fixed: missing comma fused this with "Lists"
    "Lists",
    "if/else constructs",
    "in operator",
    "list methods: append and remove",
    "list methods: index, pop, and insert",
    "list methods: slicing",
    "list methods: deleting an item",
    "for loops",
    "range() and str()",
    "== comparison operator",
    "len() function",
    "code commenting with #",
    "Comparison operators: !=, >, >=, <, <=",
    "String methods: .lower(), .upper(), .title(), .capitalize()",
    "Using the newline character in strings",
    "int()",
    "float()",
    "elif",
    "import keyword",
    "random module",
    "while keyword",
    "or and not operators",
    "booleans",
    "list methods: .clear(), .copy(), .count(), .extend(), .reverse(), .sort()",
    "Dictionaries",
    "Dictionary methods: .items(), .keys(), .values(), .update(), .pop()",
    # NOTE(review): .format() is a str method, not a dict method — confirm the
    # intended topic (perhaps .fromkeys()?). Typo "Dictonary" fixed.
    "Dictionary methods: .get(), .format()",
    "String methods: .find(), .join(), .replace(), .split(), .swapcase()",
    "Functions: using def and return keywords",
    "Function methods: .isinstance()",
    "Raising exceptions",
    "Exceptions: TypeError() and ValueError()",
    "Function keywords: as and from",
    "The sys module",
    "The with keyword",
    "Tuples",
    "The lambda keyword",
    "The built-in map function",
    "The time module",
    "Built in methods: __init__() and __str__()",  # fixed malformed dunders
    "Double underscore for private methods",
    "Classes",
]
# Size buckets for generated snippets; sampled per example via random.choices.
CODE_LENGTH = [
    "short",
    "paragraph",
    "small_function",
    "large_function",
]
# Equal weights -> a uniform pick over the four buckets.
CODE_LENGTH_WEIGHTS = [0.25, 0.25, 0.25, 0.25]
# --- Model for structured output ---
from pydantic import BaseModel


class CodeExplanation(BaseModel):
    """Structured model response: a code snippet plus its explanation.

    Used as the `text_format` target so the API returns parsed fields
    instead of free text.
    """

    # The generated Python snippet, unwrapped (no ```python fences).
    code: str
    # Two-paragraph, student-friendly explanation of the snippet.
    explanation: str
# --- Get OpenRouter API key ---
import os
import sys

from dotenv import load_dotenv

# In Colab the key lives in the user's secret store; elsewhere it is read
# from a .env file / the environment via python-dotenv.
if 'google.colab' in sys.modules:
    from google.colab import userdata  # type:ignore

    # NOTE(review): userdata.get returns None when the secret is missing,
    # and assigning None to os.environ raises TypeError — confirm the
    # OPENROUTER_API_KEY secret is configured in Colab.
    os.environ['OPENROUTER_API_KEY'] = userdata.get('OPENROUTER_API_KEY')
else:
    load_dotenv()
# --- Conversation generation: OpenAI-compatible client for OpenRouter ---
import openai
import os
# The client targets the OpenRouter endpoint (DATAGEN_URL) and authenticates
# with the OPENROUTER_API_KEY loaded into the environment above.
client = openai.OpenAI(
base_url=DATAGEN_URL,
api_key=os.environ.get("OPENROUTER_API_KEY"),
)
def generate_completion(prompt: str) -> CodeExplanation | None:
    """Send *prompt* to the data-generation model and parse the reply.

    Returns the parsed ``CodeExplanation``, or ``None`` when the response
    could not be parsed into the structured format.
    """
    message = {"role": "user", "content": prompt}
    result = client.responses.parse(
        model=DATAGEN_MODEL,
        input=[message],
        temperature=TEMPERATURE,
        stream=False,
        text_format=CodeExplanation,
    )
    return result.output_parsed
def create_conversation(topic: str, code_length: str) -> CodeExplanation | None:
    """Generate one code-snippet/explanation pair about *topic*.

    ``code_length`` selects the size bucket (one of CODE_LENGTH); any
    unknown value falls back to a generic request. Returns ``None`` when
    the model response could not be parsed.
    """
    # Map each size bucket to its prompt phrasing; dict dispatch replaces
    # the original if/elif chain (same strings, same fallback).
    requests = {
        "short": f"2 - 4 lines of Python code about {topic}",
        "paragraph": f"3 - 6 lines of Python code about {topic}",
        "small_function": f"a small function (around 10 lines of Python code) about {topic}",
        "large_function": f"a large function (around 10 - 20 lines of Python code) about {topic}",
    }
    request = requests.get(code_length, f"a Python code example about {topic}")
    prompt = f"""
Generate me {request}.
For this selection of code, generate a short 2 paragraph explanation of what the selected code does:
- The explanation should be suitable for a high school student learning Python.
- When it makes sense, the second paragraph of the explanation should use an analogy to help the student better understand the code.
- DO NOT wrap the code in a ```python block
Return the following:
1. The original code as a string.
2. Your explanation of what the selected code does as a string.
"""
    return generate_completion(prompt)
import random
import json
from tqdm import tqdm
def generate_dataset(num_examples: int, filename: str, max_attempts: int = 5) -> None:
    """Generate *num_examples* chat examples and write them to *filename* as JSONL.

    Each line is a ``{"messages": [...]}`` record pairing generated code
    (user turn) with its explanation (assistant turn).

    Raises:
        RuntimeError: if one example cannot be generated after
            ``max_attempts`` tries (the original retried forever, which
            hangs on a persistent API failure).
    """
    with open(filename, "w", encoding="utf-8") as f:
        for idx in tqdm(range(num_examples)):
            topic = random.choice(TOPICS)
            code_length = random.choices(CODE_LENGTH, weights=CODE_LENGTH_WEIGHTS)[0]

            # Retry transient failures, but cap attempts instead of looping
            # forever (was: `while conversation == None`; also use `is None`).
            conversation = None
            for _ in range(max_attempts):
                conversation = create_conversation(topic, code_length)
                if conversation is not None:
                    break
                print(f"Error generating conversation for example {idx}")
            if conversation is None:
                raise RuntimeError(
                    f"Failed to generate example {idx} after {max_attempts} attempts"
                )

            template = {
                "messages": [
                    {"role": "user", "content": conversation.code},
                    {"role": "assistant", "content": conversation.explanation},
                ]
            }
            f.write(json.dumps(template) + "\n")
            # Flush per example so partial progress survives a crash.
            f.flush()
        # No trailing flush()/close(): the with-block handles both.
# --- Generate all the data! ---
from datetime import datetime

# Take a single timestamp so the train/valid/test files of one run share it;
# the original called datetime.now() three times, which could straddle a
# second boundary and produce mismatched filenames.
# NOTE(review): ':' in filenames is invalid on Windows — confirm target OS.
_RUN_STAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
TRAIN_FILE = f"{DATA_FOLDER}/train_{_RUN_STAMP}.jsonl"
VALID_FILE = f"{DATA_FOLDER}/valid_{_RUN_STAMP}.jsonl"
TEST_FILE = f"{DATA_FOLDER}/test_{_RUN_STAMP}.jsonl"

generate_dataset(10, TEST_FILE)