Create Hugging Face Dataset

Install dependencies

!uv pip install datasets

Using Python 3.13.1 environment at: /Users/simon/Dev/CS-394/.venv

Audited 1 package in 24ms

Configuration

Note: If you are running this notebook on Colab, be sure to first upload your train, validation, and test data (.jsonl) files.

# Paths to the JSON Lines files for each split, relative to the notebook's working directory.
TRAIN_FILE = "./train.jsonl"
VALIDATION_FILE = "./validation.jsonl"
TEST_FILE = "./test.jsonl"

# Hugging Face Hub dataset repository the splits and dataset card are uploaded to.
DATASET_REPO = "simonguest/test-dataset"

Create dataset functions

from datasets import Dataset, DatasetDict
import json


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Blank (or whitespace-only) lines are skipped. Lines that are not valid
    JSON are reported on stdout, together with the file path and the 1-based
    line number, and then skipped so that one malformed record does not abort
    the whole load.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list holding one parsed JSON value per valid, non-empty line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    preview_limit = 200  # Max characters of a bad line echoed in the warning
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Include the path: callers may load several files, and a bare
                # line number does not say which one needs fixing.
                print(f"Warning: Error parsing {file_path} line {line_num}: {e}")
                preview = line[:preview_limit]
                if len(line) > preview_limit:
                    # Only mark truncation when something was actually cut off.
                    preview += "..."
                print(f"Problematic line: {preview}")
    return data


def create_hf_dataset(train_file, val_file, test_file):
    """Build a ``DatasetDict`` with train/validation/test splits from JSONL files.

    Args:
        train_file: Path to the JSONL file for the "train" split.
        val_file: Path to the JSONL file for the "validation" split.
        test_file: Path to the JSONL file for the "test" split.

    Returns:
        A ``DatasetDict`` keyed by "train", "validation" and "test".
    """
    split_paths = {
        "train": train_file,
        "validation": val_file,
        "test": test_file,
    }

    # Read every file first, then build the datasets, so that a file-level
    # problem surfaces before any Dataset is constructed.
    records_by_split = {
        split: load_jsonl(path) for split, path in split_paths.items()
    }

    return DatasetDict(
        {
            split: Dataset.from_list(records)
            for split, records in records_by_split.items()
        }
    )


# Create and validate the dataset is ready to upload
dataset = create_hf_dataset(TRAIN_FILE, VALIDATION_FILE, TEST_FILE)
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

# load_jsonl skips blank and malformed lines, so a split can end up empty;
# indexing [0] on an empty train split would raise IndexError.
if len(dataset['train']) > 0:
    print(f"\nSample entry: {dataset['train'][0]}")
else:
    print("\nWarning: train split is empty - no sample entry to show")

Train samples: 5000
Validation samples: 500
Test samples: 10

Sample entry: {'messages': [{'content': 'x = 10\ny = 3.5\nname = "Alex"\nis_student = True', 'role': 'user'}, {'content': 'This code creates four variables: x stores a whole number, y stores a decimal number, name stores text, and is_student stores a True/False value. Each variable holds information that your program can use later.\n\nYou can think of variables like labeled boxes. You place different kinds of items into different boxes, and each box has a name so you can easily find what you stored inside.', 'role': 'assistant'}]}

Get Hugging Face token

import sys
import os
from dotenv import load_dotenv

# Make HF_TOKEN available via the environment, whichever runtime we are in.
if 'google.colab' not in sys.modules:
    # Not on Colab: pull environment variables from a .env file.
    load_dotenv()
    print("Loaded env vars from .env")
else:
    # On Colab: copy the token from Colab's user data into the environment.
    from google.colab import userdata  # type:ignore
    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
    print("HF API Token set for Colab")

Loaded env vars from .env

Upload to Hugging Face

def upload_dataset(dataset, repo_name, token, *, private=False):
    """Push a dataset to the Hugging Face Hub and print its URL.

    Args:
        dataset: Object exposing ``push_to_hub(repo_name, token=..., private=...)``.
        repo_name: Hub repository id, e.g. ``"user/dataset-name"``.
        token: Hugging Face access token, passed through to ``push_to_hub``.
        private: Passed through to ``push_to_hub`` as ``private``. Defaults to
            ``False``, the value this function previously hard-coded.
    """
    dataset.push_to_hub(
        repo_name,
        token=token,
        private=private,
    )

    print(f"Dataset uploaded successfully to: https://huggingface.co/datasets/{repo_name}")

# Publish the prepared splits using the token taken from the environment.
upload_dataset(dataset=dataset, repo_name=DATASET_REPO, token=os.getenv("HF_TOKEN"))

Dataset uploaded successfully to: https://huggingface.co/datasets/simonguest/test-dataset

Create the dataset card

from huggingface_hub import DatasetCard

# A dataset card is YAML front matter enclosed by exactly one pair of `---`
# lines, followed by a Markdown body. A plain (non-f) string is used because
# there is nothing to interpolate.
card_content = """---
pretty_name: "Test Dataset for CS-394"
license: mit
---

# Test Dataset

This is a test dataset of Python code snippets and explanations, used in DigiPen's CS-394 course.
"""

card = DatasetCard(card_content)
# Write the card locally as README.md.
card.save("./README.md")

Upload the dataset card

from huggingface_hub import HfApi

# Upload the locally saved card as README.md in the dataset repository.
api = HfApi()
api.upload_file(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    path_in_repo="README.md",
    path_or_fileobj="./README.md",
)

CommitInfo(commit_url='https://huggingface.co/datasets/simonguest/test-dataset/commit/debbc8c9e4e94646b5ee769874a27abc00da093c', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='debbc8c9e4e94646b5ee769874a27abc00da093c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/simonguest/test-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='simonguest/test-dataset'), pr_revision=None, pr_num=None)