Supervised Fine-Tuning on CUDA

Badges (from the original notebook): Open in Colab · Download .ipynb

Dependencies

!uv pip install -q datasets bitsandbytes trl

Model parameters

import os

# --- Project identity and storage locations --------------------------------
# Plain module-level constants consumed by every cell below.

DATASET_REPO = "simonguest/test-dataset" # Change this to your uploaded dataset location on HF

BASE_MODEL_VENDOR = "Qwen" # Change this to your desired base model
BASE_MODEL_NAME = "Qwen3-1.7B" # Change this to your desired base model
BASE_MODEL = f"{BASE_MODEL_VENDOR}/{BASE_MODEL_NAME}"  # full HF model id, e.g. "Qwen/Qwen3-1.7B"

PROJECT_NAME = "code-explainer" # Name of your training project
os.environ["WANDB_PROJECT"] = PROJECT_NAME # WandB will use the same project name

MODEL_NAME = f"{BASE_MODEL_NAME}-{PROJECT_NAME}" # Final model name
HF_USERNAME = "simonguest" # Hugging Face username - used to upload your final models

MODEL_FOLDER = f"./models/{MODEL_NAME}" # Local folder for storing the model files during training

Training hyperparameters

# Effective batch size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16.
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4  # common starting point for LoRA fine-tuning
NUM_EPOCHS = 3
MAX_SEQ_LENGTH = 512  # NOTE(review): defined here but never passed to the trainer below — confirm it is actually applied

# LoRA-specific Parameters
LORA_R = 16          # adapter rank (dimensionality of the low-rank update)
LORA_ALPHA = 32      # scaling factor; effective scale is LORA_ALPHA / LORA_R
LORA_DROPOUT = 0.05  # dropout applied within the LoRA layers during training

# Use 4-bit quantization for efficiency (QLoRA)
USE_4BIT = True

API keys and tokens

import sys
import os
from dotenv import load_dotenv

# Pull API credentials from the right place for the current runtime:
# Colab secrets when running inside Colab, a local .env file otherwise.
if 'google.colab' in sys.modules:
  from google.colab import userdata # type:ignore
  # NOTE(review): presumably userdata.get() errors (or returns None) when a
  # secret is missing or not shared with this notebook — confirm both
  # HF_TOKEN and WANDB_API_KEY are configured in Colab before running.
  os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
  os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')
  print("HF and WANDB API Tokens set for Colab")
else:
  load_dotenv()
  print("Loaded env vars from .env")

Load dataset from Hugging Face

from datasets import load_dataset
# Download every split of the dataset from the Hugging Face Hub.
dataset = load_dataset(DATASET_REPO)

Format dataset for correct chat template

from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse EOS as the pad token (assumes the base tokenizer defines no pad
# token of its own — TODO confirm for this base model).
tokenizer.pad_token = tokenizer.eos_token
# Right-padding is the conventional choice for causal-LM training.
tokenizer.padding_side = "right"

def format_chat_template(example):
    """Render one dataset row's chat messages into a single training string.

    The model's chat template is applied without a trailing generation
    prompt, since the assistant reply is already part of the messages.
    Returns a dict with a single "text" key holding the rendered string.
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"], tokenize=False, add_generation_prompt=False
    )
    return {"text": rendered}

# Apply the chat template to every split, keeping only the rendered "text"
# column (the original message columns are dropped).
formatted_dataset = dataset.map(
    format_chat_template,
    remove_columns=dataset['train'].column_names # type: ignore
)

# Sanity check: show the first 500 characters of one rendered example.
print("\nFormatted example:")
print(formatted_dataset['train'][0]['text'][:500])  # type: ignore

Load base model with QLoRA configuration

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configure 4-bit quantization
if USE_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",              # NormalFloat4, the QLoRA-recommended type
        bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16 even though weights are 4-bit
        bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    )
else:
    bnb_config = None

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,  # None disables quantization entirely
    device_map="auto",               # let accelerate place layers on available devices
    trust_remote_code=True,
    dtype=torch.bfloat16,
)

# Prepare model for k-bit training
# (must happen before the LoRA adapter is attached)
if USE_4BIT:
    model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    # Adapt all attention and MLP projection layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # confirms only a small fraction of weights will train

Create training configuration and trainer

import os
from transformers import TrainingArguments
from trl import SFTTrainer

# Report to Weights & Biases only when an API key is actually available;
# otherwise disable experiment tracking. Truthiness (rather than `!= None`)
# also treats an empty WANDB_API_KEY (e.g. a blank line in .env) as absent,
# avoiding a wandb login failure at train time.
report_to = "wandb" if os.environ.get("WANDB_API_KEY") else "none"

# Training arguments
# Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_FOLDER,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",        # cosine decay after warmup
    warmup_ratio=0.1,                  # LR ramps up over the first 10% of steps
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,                    # eval/save cadences match so best-model tracking works
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,                # keep only the two most recent checkpoints
    bf16=True,
    gradient_checkpointing=True,       # trade recompute for activation memory
    optim="paged_adamw_8bit",          # bitsandbytes paged 8-bit AdamW (fits in less VRAM)
    report_to=report_to,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Create trainer
# Create trainer
# SFTTrainer consumes the pre-rendered "text" column of each split.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["validation"],
    processing_class=tokenizer,
)

# Save the tokenizer model as this won't change during training
tokenizer.save_pretrained(f"{MODEL_FOLDER}/lora")

Train and save the final model

# Start training
trainer.train()

# Save the final model
trainer.save_model(f"{MODEL_FOLDER}/lora")

Load the final model

from transformers import pipeline

# Build a generation pipeline straight from the saved LoRA adapter folder.
# do_sample=True is required for `temperature` to take effect: transformers
# defaults to greedy decoding and ignores temperature (with a warning)
# when sampling is disabled.
pipe = pipeline(
    "text-generation",
    model=f"{MODEL_FOLDER}/lora",
    tokenizer=f"{MODEL_FOLDER}/lora",
    do_sample=True,
    temperature=0.7,
    max_new_tokens=256,
)

Test the final model using the test dataset

import random

# Pick a random example from the held-out test split.
idx = random.randint(0, len(formatted_dataset['test']) -1)
# NOTE(review): "text" holds the FULL rendered conversation, including the
# assistant's gold reply — so the model is asked to continue past the answer
# rather than to answer the user prompt. Consider stripping the assistant
# turn (or re-rendering with add_generation_prompt=True) before generating.
test_data = formatted_dataset['test'][idx]["text"]

output = pipe(test_data)
print(output[0]['generated_text'])

Create a model card

from huggingface_hub import ModelCard

# Model card content: YAML front matter followed by a markdown body, with
# training-run values interpolated from the configuration cells above.
card_content = f"""---
base_model: {BASE_MODEL}
tags:
- peft
- lora
- text-generation
---

# {MODEL_NAME}

## Model Description
Fine-tuned from `{BASE_MODEL}` using QLoRA (4-bit) with supervised fine-tuning.

## Training Details
- Dataset: `{DATASET_REPO}`
- LoRA rank: {LORA_R}, alpha: {LORA_ALPHA}
- Epochs: {NUM_EPOCHS}, Learning rate: {LEARNING_RATE}

## Intended Use

This model is a test model used for the CS-394/594 class at DigiPen.

The model is designed to provide a summary explanation of a snippet of Python code, to be used in an IDE. This model takes a snippet of code (passed as the user prompt) and returns a two paragraph explanation of what the code does, including an analogy that helps students better understand how the code functions.

## Limitations

This model is a single-turn model and has not been trained to support long, multi-turn conversations.
"""

# Write the card to the adapter folder so it is uploaded alongside it.
card = ModelCard(card_content)
card.save(f"{MODEL_FOLDER}/lora/README.md")

Merge the models

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import PeftModel

# Load the configuration and model
# Reload the base model in bf16 (unquantized) — the 4-bit training copy is
# not reused for merging.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    dtype=torch.bfloat16,
)
# Attach the trained LoRA adapter on top of the freshly loaded base weights.
adapter_model = PeftModel.from_pretrained(base_model, f"{MODEL_FOLDER}/lora")

# Merge and save the model
# merge_and_unload() folds the adapter deltas into the base weights and
# returns a plain (non-PEFT) model.
merged_model = adapter_model.merge_and_unload() # type: ignore
merged_model.save_pretrained(f"{MODEL_FOLDER}/merged")
# Save the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.save_pretrained(f"{MODEL_FOLDER}/merged")

# Copy the model card to the merged folder
# (IPython shell escape — only works when run as a notebook cell)
!cp {MODEL_FOLDER}/lora/README.md {MODEL_FOLDER}/merged/.

Upload the merged model and adapter to Hugging Face

from huggingface_hub import HfApi, create_repo

# Set the MODEL_REPO
MODEL_REPO = f"{HF_USERNAME}/{MODEL_NAME}"

# Initialize the API
# (authenticates via the HF_TOKEN environment variable set earlier)
api = HfApi()

# Create the repository (if it doesn't exist)
# Best-effort: a failure is reported but does not stop the upload attempts.
try:
    create_repo(MODEL_REPO, repo_type="model", exist_ok=True, private=False)
    print(f"Repository {MODEL_REPO} created or already exists")
except Exception as e:
    print(f"Error creating repo: {e}")

# Upload merged model files to root (this is the main model)
print("\nUploading merged model to root...")
api.upload_folder(
    folder_path=f"{MODEL_FOLDER}/merged",
    repo_id=MODEL_REPO,
    repo_type="model",
    path_in_repo="",  # Empty string uploads to root
    commit_message="Upload merged model"
)

# Upload LoRA adapter to subfolder (optional but useful for reference)
print("\nUploading LoRA adapter...")
api.upload_folder(
    folder_path=f"{MODEL_FOLDER}/lora",
    repo_id=MODEL_REPO,
    repo_type="model",
    path_in_repo="lora_adapter",  # Keep adapter in subfolder
    commit_message="Upload LoRA adapter"
)

print(f"\n✓ All files uploaded successfully to https://huggingface.co/{MODEL_REPO}")