from datasets import Dataset, DatasetDict
import json
def load_jsonl(file_path: str) -> list:
    """Load a JSON-Lines file into a list of parsed records.

    Blank lines are silently skipped. Lines that are not valid JSON are
    reported on stdout (with a 200-char preview) and skipped, so a few
    bad rows do not abort the whole load.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed object per valid, non-empty line.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        # enumerate from 1 so warnings match editor line numbers
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            try:
                # Try parsing the line
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: Error parsing line {line_num}: {e}")
                print(f"Problematic line: {line[:200]}...")
    return data
def create_hf_dataset(train_file: str, val_file: str, test_file: str) -> "DatasetDict":
    """Build a Hugging Face ``DatasetDict`` from three JSONL split files.

    Each file is read with :func:`load_jsonl` (malformed lines are skipped
    with a warning) and converted to a ``Dataset`` via ``Dataset.from_list``.

    Args:
        train_file: Path to the training-split JSONL file.
        val_file: Path to the validation-split JSONL file.
        test_file: Path to the test-split JSONL file.

    Returns:
        A ``DatasetDict`` with keys ``"train"``, ``"validation"``, ``"test"``.
    """
    # Load the data
    train_data = load_jsonl(train_file)
    val_data = load_jsonl(val_file)
    test_data = load_jsonl(test_file)
    # Create datasets
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)
    test_dataset = Dataset.from_list(test_data)
    # Combine into DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": train_dataset,
            "validation": val_dataset,
            "test": test_dataset,
        }
    )
    return dataset_dict
# Create and validate the dataset is ready to upload.
# NOTE(review): TRAIN_FILE, VALIDATION_FILE, and TEST_FILE are not defined in
# this chunk — presumably module-level path constants declared earlier in the
# file or notebook; confirm they exist before running.
dataset = create_hf_dataset(TRAIN_FILE, VALIDATION_FILE, TEST_FILE)
# Sanity-check split sizes and peek at one row before any upload step.
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")
print(f"\nSample entry: {dataset['train'][0]}")




