!uv pip install datasets
Using Python 3.13.1 environment at: /Users/simon/Dev/CS-394/.venv
Audited 1 package in 24ms
Note: If you are running this notebook on Colab, be sure to first upload your training data (.jsonl) files.
# Paths to the local JSONL split files (upload these first when on Colab).
TRAIN_FILE = "./train.jsonl"
VALIDATION_FILE = "./validation.jsonl"
TEST_FILE = "./test.jsonl"
# Target repo on the Hugging Face Hub for the assembled dataset.
DATASET_REPO = "simonguest/test-dataset"

from datasets import Dataset, DatasetDict
import json
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Blank lines are skipped. Lines that fail to parse are reported on
    stdout (with a 200-character preview) and omitted from the result.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for line_num, raw in enumerate(handle, 1):
            line = raw.strip()
            if not line:
                # Nothing to parse on this line.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: Error parsing line {line_num}: {e}")
                print(f"Problematic line: {line[:200]}...")
    return records
def create_hf_dataset(train_file, val_file, test_file):
    """Load the three JSONL files and bundle them into a DatasetDict.

    Splits are named "train", "validation", and "test".
    """
    split_files = {
        "train": train_file,
        "validation": val_file,
        "test": test_file,
    }
    return DatasetDict(
        {
            split: Dataset.from_list(load_jsonl(path))
            for split, path in split_files.items()
        }
    )
# Create and validate the dataset is ready to upload
dataset = create_hf_dataset(TRAIN_FILE, VALIDATION_FILE, TEST_FILE)
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")
print(f"\nSample entry: {dataset['train'][0]}")
Validation samples: 500
Test samples: 10
Sample entry: {'messages': [{'content': 'x = 10\ny = 3.5\nname = "Alex"\nis_student = True', 'role': 'user'}, {'content': 'This code creates four variables: x stores a whole number, y stores a decimal number, name stores text, and is_student stores a True/False value. Each variable holds information that your program can use later.\n\nYou can think of variables like labeled boxes. You place different kinds of items into different boxes, and each box has a name so you can easily find what you stored inside.', 'role': 'assistant'}]}
import sys
import os
from dotenv import load_dotenv

# Colab keeps secrets in userdata; locally we read them from a .env file.
if 'google.colab' in sys.modules:
    from google.colab import userdata  # type:ignore
    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
    print("HF API Token set for Colab")
else:
    load_dotenv()
    print("Loaded env vars from .env")
def upload_dataset(dataset, repo_name, token):
    """Push *dataset* to the Hub under *repo_name* as a public dataset.

    *token* is the Hugging Face API token used for authentication.
    """
    push_options = {"token": token, "private": False}
    dataset.push_to_hub(repo_name, **push_options)
    print(f"Dataset uploaded successfully to: https://huggingface.co/datasets/{repo_name}")
# Push the assembled splits to the Hub using the token loaded above.
upload_dataset(dataset, DATASET_REPO, os.environ.get("HF_TOKEN"))
from huggingface_hub import DatasetCard

# The YAML front matter must be a single block delimited by exactly one
# opening and one closing "---"; the original had a duplicated opening
# delimiter, which made the metadata block empty so the Hub ignored
# pretty_name and license.
card_content = """---
pretty_name: "Test Dataset for CS-394"
license: mit
---
# Test Dataset
This is a test dataset of Python code snippets and explanations, used in DigiPen's CS-394 course.
"""
card = DatasetCard(card_content)
card.save("./README.md")

from huggingface_hub import HfApi
api = HfApi()
# Upload the card to the dataset repo; repo_type must be "dataset",
# otherwise the file would target a model repo of the same name.
api.upload_file(
    path_or_fileobj="./README.md",
    path_in_repo="README.md",
    repo_id=DATASET_REPO,
    repo_type="dataset",
)