from openai import OpenAI
import os
import base64
from PIL import Image
from io import BytesIO
from IPython.display import display
# Vision-language model used for every request in this notebook.
VLM_MODEL = "qwen/qwen3-vl-8b-instruct"

# OpenAI-compatible client pointed at the OpenRouter endpoint. The API key is
# read from the environment; a missing OPENROUTER_API_KEY raises KeyError here.
client = OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1",
)
def display_image(image_url):
    """Decode a base64-encoded image and show it with IPython's ``display``.

    Args:
        image_url: Either a ``data:`` URL (everything after the first comma
            is taken as the base64 payload) or a bare base64 string.

    Returns:
        None. The decoded image is opened with PIL and passed to ``display``.
    """
    # A data URL carries a "data:<mime>;base64," header before the payload;
    # any other input is decoded as-is.
    if image_url.startswith("data:"):
        encoded = image_url.split(",", 1)[1]
    else:
        encoded = image_url
    raw_bytes = base64.b64decode(encoded)
    display(Image.open(BytesIO(raw_bytes)))
def ask_vlm(image_url, question):
    """Ask the vision-language model a question about an image and stream the reply.

    Sends one user message containing the image and the question to
    ``VLM_MODEL`` through the module-level ``client``, prints the reply
    piece by piece as chunks arrive, then prints a final newline.

    Args:
        image_url: Image reference placed in the ``image_url`` content part
            (the notebook cells pass ``data_url`` from the capture step).
        question: Text prompt sent alongside the image.

    Returns:
        None. The answer is written to standard output.
    """
    stream = client.chat.completions.create(
        model=VLM_MODEL,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": question},
            ],
        }],
        stream=True,
    )
    for chunk in stream:
        # A streamed chunk may arrive with an empty ``choices`` list
        # (e.g. a usage-only chunk); indexing [0] would raise IndexError.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece is not None:
            print(piece, end='', flush=True)
    print()


# Visual Reasoning with a VLM
In the previous notebook, we asked a Vision Language Model (VLM) to describe an image. Here we go further — asking the model to reason about what it sees: answering specific questions, counting objects, and reading text.
Step 1: Capture an Image
Run the cell below to take a photo. Point your camera at something interesting — a room, an object, a book with visible text, or yourself.
import cv
import graphics
import time
# Show a 3-2-1 countdown over the camera canvas, then grab one frame into
# ``data_url`` for the cells below.
canvas = graphics.canvas()
camera = cv.start_camera(canvas)
try:
    w = canvas.get_width()
    h = canvas.get_height()
    ctx = canvas.get_context('2d')

    for count in ["3", "2", "1"]:
        # Dim the whole canvas with a half-transparent black overlay.
        ctx.fill_style = "rgba(0, 0, 0, 0.5)"
        ctx.fill_rect(0, 0, w, h)
        # Draw the countdown digit centred on the canvas.
        ctx.font = "bold 200px sans-serif"
        ctx.fill_style = "white"
        ctx.text_align = "center"
        ctx.text_baseline = "middle"
        ctx.fill_text(count, w / 2, h / 2)
        time.sleep(1)
        # NOTE(review): indentation was lost in the source; clearing after
        # every count (rather than once after the loop) is assumed — confirm.
        canvas.clear()

    data_url = cv.capture_frame(camera)
finally:
    # Stop the camera even if the countdown or the capture fails.
    camera.stop()
print("Photo captured!")

# Step 2: Ask a Reasoning Question
These questions go beyond simple description — they ask the model to think about what it sees. Choose one from the dropdown, or edit the cell to ask your own question.
# Prompt sent with the captured photo; the #@param list supplies the dropdown choices.
QUESTION = "What is the dominant color in this image?" #@param ["What is the dominant color in this image?", "What emotion or mood does this image convey?", "If this were a movie scene what genre would it be?", "What would happen next if this were a video?", "Describe this image as if explaining it to someone who cannot see"]
ask_vlm(data_url, QUESTION)

# Step 3: Counting
VLMs can count objects in an image. Change the object below and see how accurate the model is — then check it yourself.
# Kind of object the model is asked to count in the captured photo.
OBJECT = "people" #@param
ask_vlm(data_url, f"Count the number of {OBJECT} in this image. Give a specific number and briefly explain what you counted.")

# Step 4: Reading Text
VLMs can read and transcribe text from images — this is called OCR (Optical Character Recognition). For best results, retake your photo pointing at something with visible text: a book, a sign, a poster, or a screen.
# Ask the model to transcribe any text visible in the captured photo.
ask_vlm(data_url, "Read and transcribe all text that is visible in this image. If there is no text, say so.")

# Think about all the tasks the VLM just completed: reasoning, counting, and reading text.
Which task impressed you most? Where did the model make a mistake or struggle? What other visual reasoning task would you want to try?
{ "question_type": "true_false", "question": "A Vision Language Model (VLM) can only describe images — it cannot count objects or answer specific questions about them.", "answer": "False", "submitted_answer": "" }
{ "question_type": "multiple_choice", "question": "What does OCR stand for?", "options": [ { "key": "a", "text": "Optical Color Recognition" }, { "key": "b", "text": "Output Content Rendering" }, { "key": "c", "text": "Optical Character Recognition" }, { "key": "d", "text": "Object and Caption Recognition" } ], "answer": "c", "submitted_answer": "" }
{ "question_type": "freeform", "question": "What does VLM stand for?", "answer": "Vision Language Model", "submitted_answer": "" }