Text-to-Image using Stable Diffusion 1.5

Open In Colab Download .ipynb

import torch
from diffusers import StableDiffusionPipeline

# Hugging Face Hub identifier of the Stable Diffusion v1.5 checkpoint.
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"

# Download (or reuse the locally cached) pipeline weights.
pipe = StableDiffusionPipeline.from_pretrained(model_id)

# Run on the GPU when one is present; otherwise stay on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
pipe = pipe.to(device)
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:104: UserWarning: 
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).
  warnings.warn(
Keyword arguments {'generator': <torch._C.Generator object at 0x7c30c9746a30>} are not expected by StableDiffusionPipeline and will be ignored.

Show Intermediate Steps

import matplotlib.pyplot as plt
import numpy as np

# Colab form parameters — the trailing "#@param" annotations render as
# interactive widgets when this cell runs in the Colab UI.
PROMPT = "a photograph of an astronaut riding a horse" #@param {type:"string"}
STEPS = 50 #@param {type:"slider", min:10, max:100, step:1}
SEED = -1 #@param {type:"integer"}  # -1 means "no fixed seed" (see generation cell)

# Filled by the denoising callback with (step_number, PIL.Image) pairs.
intermediate_images = []

def callback_fn(step, timestep, latents):
    """Decode and store an intermediate image every 5th denoising step.

    Invoked by the pipeline once per scheduler step (``callback_steps=1``
    in the generation call); this function subsamples to every 5th step.

    Args:
        step: Zero-based index of the current denoising step.
        timestep: Scheduler timestep for this step (unused here).
        latents: Latent tensor at the current step, decoded via the VAE.

    Side effects:
        Appends ``(step, PIL.Image)`` tuples to the module-level
        ``intermediate_images`` list.
    """
    # step == 0 already satisfies step % 5 == 0, so the extra
    # "or step == 0" from the original was redundant.
    if step % 5 == 0:
        with torch.no_grad():
            # Undo the latent scaling before decoding — the VAE operates on
            # latents divided by its configured scaling_factor.
            image = pipe.vae.decode(
                latents / pipe.vae.config.scaling_factor, return_dict=False
            )[0]
            image = pipe.image_processor.postprocess(image, output_type="pil")[0]
            intermediate_images.append((step, image))

# Run the full text-to-image generation. The callback fires after every
# scheduler step (callback_steps=1); subsampling happens inside the callback.
# NOTE(review): `callback`/`callback_steps` are deprecated in recent diffusers
# releases in favor of `callback_on_step_end` — confirm the installed version
# still supports them (the L24 warning above suggests it may not).
if SEED != -1:
    # Fixed seed requested: seed a CPU generator for deterministic output.
    generator = torch.Generator().manual_seed(SEED)
else:
    # No seed: let the pipeline draw fresh randomness each run.
    generator = None

result = pipe(
    PROMPT,
    num_inference_steps=STEPS,
    callback=callback_fn,
    callback_steps=1,
    generator=generator,
).images[0]

# Visualize the denoising process on a 2x5 grid of snapshots.
num_steps_to_show = min(10, len(intermediate_images))
# Evenly spaced indices into the captured frames (includes first and last).
step_indices = np.linspace(0, len(intermediate_images) - 1,
                           num_steps_to_show, dtype=int)

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
fig.suptitle(f'Real Diffusion Model Denoising Process\nPrompt: "{PROMPT}"')

# Disable every axis up front so that, when fewer than 10 snapshots were
# captured, the unused cells stay blank instead of showing empty frames
# (the original only turned off the axes it actually drew into).
for ax in axes.flat:
    ax.axis('off')

for idx, step_idx in enumerate(step_indices):
    row, col = divmod(idx, 5)
    step_num, img = intermediate_images[step_idx]
    axes[row, col].imshow(img)
    axes[row, col].set_title(f'Step {step_num}/{STEPS}')

plt.tight_layout()
plt.savefig('diffusion_process.png', dpi=150, bbox_inches='tight')
plt.show()

Show Final Image

# Display the final generated image and save it to disk.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.imshow(result)
ax.axis('off')
ax.set_title('Final Generated Image')
plt.savefig('final_result.png', dpi=150, bbox_inches='tight')
plt.show()