Skip to content

Images & Vision

Iris supports two distinct image capabilities: vision (analyzing images with multimodal models) and generation (creating images with models like DALL-E). This guide covers both.

Vision-capable models can analyze images alongside text, enabling use cases like document analysis, image description, visual Q&A, and diagram interpretation.

| Provider  | Vision Models                                    | Max Images | Max Size |
| --------- | ------------------------------------------------ | ---------- | -------- |
| OpenAI    | gpt-4o, gpt-4o-mini, gpt-4-turbo                 | 20         | 20MB     |
| Anthropic | claude-3-5-sonnet, claude-3-opus, claude-3-haiku | 20         | 5MB      |
| Gemini    | gemini-1.5-pro, gemini-1.5-flash                 | 16         | 20MB     |
| xAI       | grok-vision-beta                                 | 1          | 10MB     |
| Ollama    | llava, llava-llama3, bakllava                    | 1          | Varies   |

Send an image URL for analysis:

package main
import (
"context"
"fmt"
"os"
"github.com/petal-labs/iris/core"
"github.com/petal-labs/iris/providers/openai"
)
func main() {
// Build the OpenAI provider from the environment API key, then wrap it
// in the core client that exposes the fluent Chat builder.
provider := openai.New(os.Getenv("OPENAI_API_KEY"))
client := core.NewClient(provider)
// Compose a multimodal user message: free-form text plus a remote image
// URL. Done() closes the multimodal part; GetResponse sends the request.
resp, err := client.Chat("gpt-4o").
System("You are a helpful image analyst.").
UserMultimodal().
Text("What's in this image? Describe it in detail.").
ImageURL("https://example.com/photo.jpg").
Done().
GetResponse(context.Background())
if err != nil {
panic(err)
}
// resp.Output holds the model's textual description of the image.
fmt.Println(resp.Output)
}

For local images or when you need to avoid URL fetching:

import (
"encoding/base64"
"os"
)
// Read and encode image
imageData, err := os.ReadFile("photo.png")
if err != nil {
panic(err)
}
base64Data := base64.StdEncoding.EncodeToString(imageData)
// Send with explicit MIME type
resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Analyze this image.").
ImageBase64(base64Data, "image/png").
Done().
GetResponse(ctx)
| Format | MIME Type  | OpenAI | Anthropic | Gemini |
| ------ | ---------- | ------ | --------- | ------ |
| PNG    | image/png  | ✓      | ✓         | ✓      |
| JPEG   | image/jpeg | ✓      | ✓         | ✓      |
| GIF    | image/gif  | ✓      | ✓         | ✓      |
| WebP   | image/webp | ✓      | ✓         | ✓      |
| HEIC   | image/heic | —      | —         | ✓      |
| HEIF   | image/heif | —      | —         | ✓      |

Analyze multiple images in a single request:

resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Compare these two images. What are the differences?").
ImageURL("https://example.com/before.jpg").
ImageURL("https://example.com/after.jpg").
Done().
GetResponse(ctx)

Mix URLs and base64 in the same request:

localImage, _ := os.ReadFile("local.png")
base64Local := base64.StdEncoding.EncodeToString(localImage)
resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Compare the online image with my local screenshot.").
ImageURL("https://example.com/reference.jpg").
ImageBase64(base64Local, "image/png").
Done().
GetResponse(ctx)

Control the detail level for image analysis. Higher detail uses more tokens but provides better analysis of fine details:

// High detail - best for diagrams, text in images, detailed analysis
resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Read all the text in this document image.").
ImageURL("https://example.com/document.png", core.ImageDetailHigh).
Done().
GetResponse(ctx)
// Low detail - faster, cheaper, good for simple classification
resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Is this a photo of a cat or a dog?").
ImageURL("https://example.com/pet.jpg", core.ImageDetailLow).
Done().
GetResponse(ctx)
// Auto (default) - model decides based on image size
resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Describe this image.").
ImageURL("https://example.com/photo.jpg", core.ImageDetailAuto).
Done().
GetResponse(ctx)
resp, err := client.Chat("gpt-4o").
System(`You are a document analysis expert. Extract structured information
from documents including:
- Document type
- Key fields and values
- Any signatures or stamps
- Document date if visible`).
UserMultimodal().
Text("Extract all information from this invoice.").
ImageURL(invoiceURL, core.ImageDetailHigh).
Done().
ResponseFormat(core.ResponseFormatJSON).
GetResponse(ctx)
resp, err := client.Chat("gpt-4o").
System("You analyze charts and graphs, providing insights about trends and data.").
UserMultimodal().
Text("Analyze this sales chart. What trends do you see?").
ImageBase64(chartImage, "image/png").
Done().
GetResponse(ctx)
resp, err := client.Chat("gpt-4o").
System("You compare images and identify differences with precision.").
UserMultimodal().
Text("List all differences between these two UI screenshots.").
ImageURL(beforeURL).
ImageURL(afterURL).
Done().
GetResponse(ctx)

Images can be included in conversation history:

// First turn: analyze image
resp1, _ := client.Chat("gpt-4o").
UserMultimodal().
Text("What's in this image?").
ImageURL(imageURL).
Done().
GetResponse(ctx)
// Second turn: follow-up question about the same image
resp2, _ := client.Chat("gpt-4o").
UserMultimodal().
Text("What's in this image?").
ImageURL(imageURL).
Done().
Assistant(resp1.Output).
User("Can you identify the brand of the car?").
GetResponse(ctx)

Stream responses while analyzing images:

stream, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Describe this image in detail.").
ImageURL(imageURL).
Done().
GetStream(ctx)
if err != nil {
panic(err)
}
for chunk := range stream.Ch {
fmt.Print(chunk.Content)
}
fmt.Println()
if err := <-stream.Err; err != nil {
panic(err)
}

Generate images from text prompts using models like DALL-E.

| Provider | Models             | Sizes                             | Features               |
| -------- | ------------------ | --------------------------------- | ---------------------- |
| OpenAI   | dall-e-3, dall-e-2 | 1024x1024, 1792x1024, 1024x1792   | Generation, variations |
package main
import (
"context"
"fmt"
"os"
"github.com/petal-labs/iris/core"
"github.com/petal-labs/iris/providers/openai"
)
func main() {
// Image generation is invoked directly on the provider (not via the
// chat builder): one prompt in, one or more generated images out.
provider := openai.New(os.Getenv("OPENAI_API_KEY"))
resp, err := provider.GenerateImage(context.Background(), &core.ImageGenerateRequest{
Model: "dall-e-3", // DALL-E 3 supports 1024x1024, 1792x1024, 1024x1792
Prompt: "A serene Japanese garden with a koi pond, cherry blossoms, and a small wooden bridge, digital art style",
Size: core.ImageSize1024x1024,
Quality: core.ImageQualityHD, // HD costs more but renders finer detail
N: 1, // DALL-E 3 generates one image per request
})
if err != nil {
panic(err)
}
// DALL-E 3 may rewrite the prompt for safety/quality; the revised text
// is returned alongside the hosted image URL.
fmt.Printf("Generated image URL: %s\n", resp.Images[0].URL)
fmt.Printf("Revised prompt: %s\n", resp.Images[0].RevisedPrompt)
}
// Square (default)
Size: core.ImageSize1024x1024
// Landscape
Size: core.ImageSize1792x1024
// Portrait
Size: core.ImageSize1024x1792
// DALL-E 2 sizes (smaller)
Size: core.ImageSize512x512
Size: core.ImageSize256x256
// Standard quality (faster, cheaper)
Quality: core.ImageQualityStandard
// HD quality (more detail, better for printing)
Quality: core.ImageQualityHD
// Vivid - hyper-real and dramatic
Style: core.ImageStyleVivid
// Natural - more natural, less hyper-real
Style: core.ImageStyleNatural

Instead of a URL, receive the image data directly:

resp, err := provider.GenerateImage(ctx, &core.ImageGenerateRequest{
Model: "dall-e-3",
Prompt: "A futuristic city skyline at sunset",
Size: core.ImageSize1024x1024,
ResponseFormat: core.ImageResponseFormatB64JSON,
})
if err != nil {
panic(err)
}
// Decode and save
imageData, _ := base64.StdEncoding.DecodeString(resp.Images[0].B64JSON)
os.WriteFile("generated.png", imageData, 0644)

DALL-E 2 can generate multiple images from a single prompt in one request (for variations of an existing image, see the variations API below):

resp, err := provider.GenerateImage(ctx, &core.ImageGenerateRequest{
Model: "dall-e-2",
Prompt: "A cute robot mascot",
Size: core.ImageSize512x512,
N: 4, // Generate 4 variations
})
for i, img := range resp.Images {
fmt.Printf("Variation %d: %s\n", i+1, img.URL)
}

Edit existing images with text prompts (inpainting).

// Read the original image
imageData, _ := os.ReadFile("original.png")
// Optional: read a mask (white areas will be edited)
maskData, _ := os.ReadFile("mask.png")
resp, err := provider.EditImage(ctx, &core.ImageEditRequest{
Model: "dall-e-2",
Prompt: "Add a red balloon floating in the sky",
Image: imageData,
Mask: maskData, // Optional
Size: core.ImageSize1024x1024,
N: 1,
})
if err != nil {
panic(err)
}
fmt.Printf("Edited image: %s\n", resp.Images[0].URL)
Mask requirements:
  • Must be the same size as the original image
  • Must be a PNG with transparency or white areas indicating where to edit
  • Fully transparent or white pixels will be replaced

Create variations of an existing image:

imageData, _ := os.ReadFile("original.png")
resp, err := provider.CreateImageVariation(ctx, &core.ImageVariationRequest{
Model: "dall-e-2",
Image: imageData,
Size: core.ImageSize1024x1024,
N: 3, // Generate 3 variations
})
for i, img := range resp.Images {
fmt.Printf("Variation %d: %s\n", i+1, img.URL)
}
// Resize large images before sending
// Most models resize internally, but you control quality
// Use low detail for simple tasks
ImageURL(url, core.ImageDetailLow)
// Compress images when using base64
// PNG for screenshots, JPEG for photos
import (
	"bytes"
	"encoding/base64"
	"fmt"
	"image"
	"image/jpeg"
	"os"
)
// resizeForVision prepares a local image for a vision request. Images whose
// width and height are both within maxDim are base64-encoded unchanged;
// larger images are resized and re-encoded as JPEG (quality 85) to cut the
// request payload and vision token cost.
//
// It returns the base64-encoded image data, or an error if the file cannot
// be opened, decoded, read, or re-encoded. (The original example silently
// discarded every one of these errors with `_` despite returning error.)
func resizeForVision(imagePath string, maxDim int) (string, error) {
	file, err := os.Open(imagePath)
	if err != nil {
		return "", fmt.Errorf("opening image %q: %w", imagePath, err)
	}
	defer file.Close()

	// Decode only to inspect dimensions; the registered formats (via the
	// image/jpeg import, plus any others the caller registers) apply here.
	img, _, err := image.Decode(file)
	if err != nil {
		return "", fmt.Errorf("decoding image %q: %w", imagePath, err)
	}

	// Already small enough: send the original bytes untouched so no
	// quality is lost to re-encoding.
	bounds := img.Bounds()
	if bounds.Dx() <= maxDim && bounds.Dy() <= maxDim {
		data, err := os.ReadFile(imagePath)
		if err != nil {
			return "", fmt.Errorf("reading image %q: %w", imagePath, err)
		}
		return base64.StdEncoding.EncodeToString(data), nil
	}

	// Resize (implementation depends on your image library).
	resized := resize(img, maxDim)

	// JPEG at quality 85 is a good size/fidelity trade-off for photos;
	// use PNG instead if the source is a screenshot with sharp text.
	var buf bytes.Buffer
	if err := jpeg.Encode(&buf, resized, &jpeg.Options{Quality: 85}); err != nil {
		return "", fmt.Errorf("encoding resized image: %w", err)
	}
	return base64.StdEncoding.EncodeToString(buf.Bytes()), nil
}
resp, err := client.Chat("gpt-4o").
UserMultimodal().
Text("Analyze this.").
ImageURL(imageURL).
Done().
GetResponse(ctx)
if err != nil {
var imgErr *core.ImageError
if errors.As(err, &imgErr) {
switch imgErr.Code {
case "invalid_image_format":
log.Println("Unsupported image format")
case "image_too_large":
log.Println("Image exceeds size limit")
case "image_fetch_failed":
log.Println("Could not fetch image from URL")
default:
log.Printf("Image error: %s", imgErr.Message)
}
return
}
// Handle other errors
}
// Vision token costs vary by detail level and image size
//
// Low detail: ~85 tokens per image
// High detail: 85 + 170 * (tiles) tokens
// - Tiles = ceil(width/512) * ceil(height/512)
//
// Example: 2048x2048 image at high detail
// - Tiles: 4 * 4 = 16
// - Tokens: 85 + 170 * 16 = 2,805 tokens
// For batch processing, consider:
// 1. Resize images to needed resolution
// 2. Use low detail when possible
// 3. Batch multiple images per request (up to limits)

Tools Guide

Combine vision with tool calling. Tools →