Skip to content

Multimodal QA

This example demonstrates how to build applications that combine text and images for richer AI interactions. From analyzing architecture diagrams to processing screenshots for automated testing, multimodal capabilities unlock new use cases.

A visual analysis system that:

  1. Processes images from URLs, files, and base64 data
  2. Analyzes diagrams, charts, screenshots, and documents
  3. Extracts structured information from visual content
  4. Supports multiple images in a single conversation
| Use Case | Description |
|----------|-------------|
| Architecture review | Analyze system diagrams for bottlenecks |
| UI/UX feedback | Review mockups and screenshots |
| Document processing | Extract data from forms, receipts, invoices |
| Chart analysis | Interpret graphs and visualizations |
| Accessibility audit | Identify issues in UI screenshots |
Terminal window
# Install dependencies
go get github.com/petal-labs/iris
# Set up API key (choose your provider)
iris keys set openai # GPT-4o
iris keys set anthropic # Claude
iris keys set gemini # Gemini Pro Vision
package main
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/petal-labs/iris/core"
"github.com/petal-labs/iris/providers/openai"
)
// ImageAnalyzer provides visual analysis capabilities.
// Its methods build chat requests on client and address them to model.
type ImageAnalyzer struct {
// client is the Iris client used to build and send every request.
client *core.Client
// model is the model identifier passed to client.Chat.
model string
}
// AnalysisResult contains structured analysis output.
type AnalysisResult struct {
// Description holds the model's full text response.
Description string `json:"description"`
// Elements lists detected visual components; omitted from JSON when empty.
// No code visible in this example populates it.
Elements []VisualElement `json:"elements,omitempty"`
// Issues lists problems parsed out of the response text; omitted from JSON
// when empty.
Issues []Issue `json:"issues,omitempty"`
// Metadata carries free-form key/value annotations; omitted from JSON when
// empty. No code visible in this example populates it.
Metadata map[string]string `json:"metadata,omitempty"`
}
// VisualElement represents a detected component.
type VisualElement struct {
// Type names the kind of component.
Type string `json:"type"`
// Description is a human-readable summary of the component.
Description string `json:"description"`
// Location says where the component appears; omitted from JSON when empty.
Location string `json:"location,omitempty"`
}
// Issue represents a detected problem.
type Issue struct {
// Severity is the bracketed label from the response; parseIssues stores it
// lower-cased.
Severity string `json:"severity"`
// Description is the text that follows the severity label.
Description string `json:"description"`
// Suggestion is the text after the first " - " separator, if any; omitted
// from JSON when empty.
Suggestion string `json:"suggestion,omitempty"`
}
// main runs the four example analyses in sequence under a shared two-minute
// deadline. A failure in one example is logged and the next example still
// runs; only a failure to construct the analyzer aborts the program.
func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	analyzer, err := NewImageAnalyzer()
	if err != nil {
		log.Fatal(err)
	}

	// Example 1: architecture diagram referenced by URL.
	fmt.Println("=== Architecture Diagram Analysis ===")
	if diagram, err := analyzer.AnalyzeURL(
		ctx,
		"https://example.com/architecture-diagram.png",
		"Analyze this system architecture. Identify components, data flows, and potential bottlenecks.",
	); err != nil {
		log.Printf("URL analysis error: %v", err)
	} else {
		fmt.Println(diagram.Description)
	}

	// Example 2: screenshot read from the local filesystem.
	fmt.Println("\n=== Screenshot Analysis ===")
	if shot, err := analyzer.AnalyzeFile(
		ctx,
		"./screenshots/dashboard.png",
		"Review this dashboard UI. Check for usability issues and suggest improvements.",
	); err != nil {
		log.Printf("File analysis error: %v", err)
	} else {
		fmt.Println(shot.Description)
		for i := range shot.Issues {
			fmt.Printf("- [%s] %s\n", shot.Issues[i].Severity, shot.Issues[i].Description)
		}
	}

	// Example 3: two mockups sent together in one request.
	fmt.Println("\n=== Design Comparison ===")
	mockups := []string{"./mockups/v1.png", "./mockups/v2.png"}
	if comparison, err := analyzer.CompareImages(
		ctx,
		mockups,
		"Compare these two design versions. What changed? Which is better for usability?",
	); err != nil {
		log.Printf("Comparison error: %v", err)
	} else {
		fmt.Println(comparison.Description)
	}

	// Example 4: structured extraction driven by InvoiceSchema.
	fmt.Println("\n=== Data Extraction ===")
	if invoice, err := analyzer.ExtractData(ctx, "./documents/invoice.png", InvoiceSchema); err != nil {
		log.Printf("Extraction error: %v", err)
	} else {
		fmt.Printf("Invoice Data: %+v\n", invoice)
	}
}
// NewImageAnalyzer creates an analyzer backed by the OpenAI provider.
//
// It first calls openai.NewFromKeystore and, if that fails, falls back to
// openai.NewFromEnv; only the second error is reported. The returned analyzer
// uses the "gpt-4o" model and a client configured with a retry policy for
// status codes 429, 500 and 503.
func NewImageAnalyzer() (*ImageAnalyzer, error) {
	provider, err := openai.NewFromKeystore()
	if err != nil {
		if provider, err = openai.NewFromEnv(); err != nil {
			return nil, fmt.Errorf("openai not configured: %w", err)
		}
	}

	retry := &core.RetryPolicy{
		MaxRetries:        3,
		InitialInterval:   1 * time.Second,
		MaxInterval:       30 * time.Second,
		BackoffMultiplier: 2.0,
		RetryOn:           []int{429, 500, 503},
	}

	analyzer := &ImageAnalyzer{
		client: core.NewClient(provider, core.WithRetryPolicy(retry)),
		model:  "gpt-4o",
	}
	return analyzer, nil
}
// AnalyzeURL analyzes the image referenced by imageURL using prompt as the
// user instruction. The model's text response is returned in
// AnalysisResult.Description; no other result fields are set.
func (a *ImageAnalyzer) AnalyzeURL(ctx context.Context, imageURL, prompt string) (*AnalysisResult, error) {
	message := a.client.Chat(a.model).
		System("You are an expert visual analyst. Provide detailed, actionable analysis.").
		UserMultimodal().
		Text(prompt).
		ImageURL(imageURL)

	resp, err := message.Done().Temperature(0.3).GetResponse(ctx)
	if err != nil {
		return nil, fmt.Errorf("analysis failed: %w", err)
	}

	result := &AnalysisResult{Description: resp.Output}
	return result, nil
}
// AnalyzeFile analyzes the image stored at filePath using prompt as the user
// instruction.
//
// The file is read in full, base64-encoded and sent with the MIME type chosen
// by getMimeType. The response text is returned as Description, and any lines
// parseIssues recognises in it are returned as Issues.
func (a *ImageAnalyzer) AnalyzeFile(ctx context.Context, filePath, prompt string) (*AnalysisResult, error) {
	const systemPrompt = `You are an expert visual analyst. When analyzing images:
1. Describe what you see in detail
2. Identify any issues or problems
3. Provide specific, actionable suggestions
Format issues as: [SEVERITY] Description - Suggestion`

	raw, err := os.ReadFile(filePath)
	if err != nil {
		return nil, fmt.Errorf("reading file: %w", err)
	}
	encoded := base64.StdEncoding.EncodeToString(raw)

	request := a.client.Chat(a.model).
		System(systemPrompt).
		UserMultimodal().
		Text(prompt).
		ImageBase64(encoded, getMimeType(filePath)).
		Done().
		Temperature(0.3)

	resp, err := request.GetResponse(ctx)
	if err != nil {
		return nil, fmt.Errorf("analysis failed: %w", err)
	}

	return &AnalysisResult{
		Description: resp.Output,
		Issues:      parseIssues(resp.Output),
	}, nil
}
// CompareImages sends every file in filePaths, in order, together with prompt
// in a single request and returns the model's text response as Description.
//
// Reading stops at the first file that cannot be read; the error reports that
// file's index within filePaths.
func (a *ImageAnalyzer) CompareImages(ctx context.Context, filePaths []string, prompt string) (*AnalysisResult, error) {
	message := a.client.Chat(a.model).
		System("You are an expert at comparing visual designs. Analyze differences and provide recommendations.").
		UserMultimodal().
		Text(prompt)

	// Attach one base64 image part per input file.
	for idx := range filePaths {
		raw, err := os.ReadFile(filePaths[idx])
		if err != nil {
			return nil, fmt.Errorf("reading file %d: %w", idx, err)
		}
		message = message.ImageBase64(
			base64.StdEncoding.EncodeToString(raw),
			getMimeType(filePaths[idx]),
		)
	}

	resp, err := message.Done().Temperature(0.3).GetResponse(ctx)
	if err != nil {
		return nil, fmt.Errorf("comparison failed: %w", err)
	}

	result := &AnalysisResult{Description: resp.Output}
	return result, nil
}
// ExtractionSchema lists the fields ExtractData asks the model to pull out of
// a document image.
type ExtractionSchema struct {
// Fields are rendered, in order, as one bullet line each in the extraction
// prompt.
Fields []SchemaField `json:"fields"`
}
// SchemaField describes one value to extract. All three strings are inserted
// verbatim into the prompt built by ExtractData.
type SchemaField struct {
// Name is the key the model is asked to use in its JSON output.
Name string `json:"name"`
// Type is a type hint shown to the model (the example schemas use "string",
// "number" and "array").
Type string `json:"type"`
// Description tells the model what the field means.
Description string `json:"description"`
}
// InvoiceSchema is the schema main passes to ExtractData for the invoice
// example: vendor, invoice number, date, total, currency and line items.
var InvoiceSchema = ExtractionSchema{
Fields: []SchemaField{
{Name: "vendor", Type: "string", Description: "Company or person who issued the invoice"},
{Name: "invoice_number", Type: "string", Description: "Invoice number or ID"},
{Name: "date", Type: "string", Description: "Invoice date in ISO format"},
{Name: "total", Type: "number", Description: "Total amount"},
{Name: "currency", Type: "string", Description: "Currency code (USD, EUR, etc.)"},
{Name: "items", Type: "array", Description: "Line items with description and amount"},
},
}
// ExtractData asks the model to read the document image at filePath and
// return the fields described by schema.
//
// The prompt lists each schema field as "- name (type): description". The
// request is sent with temperature 0 and JSON mode enabled, and the response
// text is decoded into a generic map. An error is returned if the file cannot
// be read, the request fails, or the response is not valid JSON for a map.
func (a *ImageAnalyzer) ExtractData(ctx context.Context, filePath string, schema ExtractionSchema) (map[string]interface{}, error) {
	// Render one bullet line per requested field.
	fieldLines := make([]string, len(schema.Fields))
	for i, field := range schema.Fields {
		fieldLines[i] = fmt.Sprintf("- %s (%s): %s", field.Name, field.Type, field.Description)
	}
	prompt := fmt.Sprintf(`Extract the following information from this document:
%s
Return the data as a JSON object with these exact field names.
If a field cannot be determined, use null.`, strings.Join(fieldLines, "\n"))

	raw, err := os.ReadFile(filePath)
	if err != nil {
		return nil, fmt.Errorf("reading file: %w", err)
	}
	encoded := base64.StdEncoding.EncodeToString(raw)

	request := a.client.Chat(a.model).
		System("You are a data extraction specialist. Extract structured data from documents accurately. Always respond with valid JSON.").
		UserMultimodal().
		Text(prompt).
		ImageBase64(encoded, getMimeType(filePath)).
		Done().
		Temperature(0.0).
		JSONMode(true)

	resp, err := request.GetResponse(ctx)
	if err != nil {
		return nil, fmt.Errorf("extraction failed: %w", err)
	}

	var extracted map[string]interface{}
	if err := json.Unmarshal([]byte(resp.Output), &extracted); err != nil {
		return nil, fmt.Errorf("parsing response: %w", err)
	}
	return extracted, nil
}
// Helper functions
// getMimeType maps a file path to an image MIME type using only its
// extension, compared case-insensitively. JPEG, GIF and WebP extensions are
// recognised; everything else, including paths with no extension, is reported
// as "image/png".
func getMimeType(path string) string {
	switch strings.ToLower(strings.TrimPrefix(filepath.Ext(path), ".")) {
	case "jpeg", "jpg":
		return "image/jpeg"
	case "gif":
		return "image/gif"
	case "webp":
		return "image/webp"
	}
	// ".png" and every unrecognised extension share the same answer.
	return "image/png"
}
// parseIssues scans text line by line for entries of the form
//
//	[SEVERITY] Description - Suggestion
//
// and returns one Issue per matching line, in order of appearance. A leading
// list marker ("- ", "* ", "+ ", "• ", "1. ", "2) ", ...) is ignored, so
// bulleted or numbered issue lists are recognised as well as bare lines.
// Severity is lower-cased; the " - Suggestion" part is optional. Lines that
// do not start with a bracketed label are skipped. The result is nil when
// nothing matches.
func parseIssues(text string) []Issue {
	var issues []Issue
	for _, line := range strings.Split(text, "\n") {
		line = stripListMarker(strings.TrimSpace(line))
		if !strings.HasPrefix(line, "[") {
			continue
		}
		end := strings.Index(line, "]")
		if end < 0 {
			// Opening bracket without a closing one: not an issue label.
			continue
		}

		issue := Issue{
			Severity:    strings.ToLower(strings.TrimSpace(line[1:end])),
			Description: strings.TrimSpace(line[end+1:]),
		}
		// Split on the first " - " only when there is text before it, so a
		// description is never left empty by the split.
		if sep := strings.Index(issue.Description, " - "); sep > 0 {
			issue.Suggestion = strings.TrimSpace(issue.Description[sep+3:])
			issue.Description = strings.TrimSpace(issue.Description[:sep])
		}
		issues = append(issues, issue)
	}
	return issues
}

// stripListMarker removes one leading bullet or "N." / "N)" list marker from
// an already-trimmed line and returns the remainder, or the line unchanged
// when it carries no marker.
func stripListMarker(line string) string {
	for _, marker := range []string{"- ", "* ", "+ ", "• "} {
		if strings.HasPrefix(line, marker) {
			return strings.TrimSpace(line[len(marker):])
		}
	}

	digits := 0
	for digits < len(line) && line[digits] >= '0' && line[digits] <= '9' {
		digits++
	}
	if digits > 0 && digits+1 < len(line) &&
		(line[digits] == '.' || line[digits] == ')') && line[digits+1] == ' ' {
		return strings.TrimSpace(line[digits+2:])
	}
	return line
}

Iris supports multiple ways to include images:

// From URL: attach an image by its address with ImageURL.
builder.UserMultimodal().
Text("Describe this image").
ImageURL("https://example.com/image.png").
Done()
// From base64-encoded data: pass the encoded bytes and their MIME type to
// ImageBase64.
builder.UserMultimodal().
Text("Describe this image").
ImageBase64(base64Data, "image/png").
Done()
// Multiple images: chain one image call per image before Done.
builder.UserMultimodal().
Text("Compare these images").
ImageURL("https://example.com/before.png").
ImageURL("https://example.com/after.png").
Done()

Different providers offer unique capabilities:

| Provider | Feature | Method |
|----------|---------|--------|
| OpenAI | High/low detail mode | `ImageURL(url, core.ImageDetailHigh)` |
| Claude | PDF processing | `File(data, "application/pdf")` |
| Gemini | Video analysis | `Video(data, "video/mp4")` |
| Gemini | Audio analysis | `Audio(data, "audio/mp3")` |

Use JSON mode for reliable data extraction:

// Request a JSON-mode response and decode it into a generic map.
resp, err := client.Chat("gpt-4o").
	System("Extract data as JSON.").
	UserMultimodal().
	Text("Extract the invoice details.").
	ImageBase64(base64Data, "image/png").
	Done().
	JSONMode(true).
	GetResponse(ctx)
if err != nil {
	// resp must not be read when the request failed.
	log.Fatalf("extraction failed: %v", err)
}

var data map[string]interface{}
if err := json.Unmarshal([]byte(resp.Output), &data); err != nil {
	// Without this check a malformed response would leave data nil with no
	// indication of why.
	log.Fatalf("parsing response: %v", err)
}

Optimize images before sending:

import (
	"bytes"
	"image"
	"image/jpeg"
	_ "image/png" // registers the PNG decoder so image.Decode accepts .png input
)
// preprocessImage loads the image at path, shrinks it via resize when either
// dimension exceeds maxSize pixels, and returns it re-encoded as JPEG at
// quality 85.
//
// image.Decode only understands formats whose decoders have been registered
// by importing the matching package (for example a blank import of
// image/png); anything else fails with image.ErrFormat. Every returned error
// wraps the underlying cause and names the step that failed.
//
// NOTE(review): resize is defined outside this snippet; presumably it scales
// img to fit within maxSize — confirm its contract.
func preprocessImage(path string, maxSize int) ([]byte, error) {
	file, err := os.Open(path)
	if err != nil {
		// The *os.PathError already carries the path.
		return nil, fmt.Errorf("opening image: %w", err)
	}
	defer file.Close()

	img, _, err := image.Decode(file)
	if err != nil {
		// Decode errors such as "image: unknown format" do not mention the
		// file, so add it here.
		return nil, fmt.Errorf("decoding image %q: %w", path, err)
	}

	// Resize if needed
	bounds := img.Bounds()
	if bounds.Dx() > maxSize || bounds.Dy() > maxSize {
		img = resize(img, maxSize)
	}

	// Encode as JPEG for smaller size
	var buf bytes.Buffer
	if err := jpeg.Encode(&buf, img, &jpeg.Options{Quality: 85}); err != nil {
		return nil, fmt.Errorf("encoding image %q as JPEG: %w", path, err)
	}
	return buf.Bytes(), nil
}

Process multiple images efficiently:

// BatchAnalyze runs AnalyzeFile for every entry in paths with the same prompt
// and returns the results in input order (results[i] belongs to paths[i]).
//
// At most maxConcurrent analyses run at once. A slot is acquired before each
// goroutine is started, so the number of live goroutines is bounded too.
//
// If any analysis fails, the first failure in input order is returned,
// wrapped with the path that caused it. The results slice is still returned
// alongside the error; entries for failed paths are zero values.
func (a *ImageAnalyzer) BatchAnalyze(ctx context.Context, paths []string, prompt string) ([]AnalysisResult, error) {
	const maxConcurrent = 5 // Limit concurrency

	results := make([]AnalysisResult, len(paths))
	// Named errs, not errors, so the standard errors package is not shadowed.
	errs := make([]error, len(paths))

	var wg sync.WaitGroup
	sem := make(chan struct{}, maxConcurrent)

	for i, path := range paths {
		sem <- struct{}{} // blocks until one of the running analyses finishes
		wg.Add(1)
		go func(idx int, p string) {
			defer wg.Done()
			defer func() { <-sem }()

			result, err := a.AnalyzeFile(ctx, p, prompt)
			if err != nil {
				errs[idx] = fmt.Errorf("analyzing %q: %w", p, err)
				return
			}
			results[idx] = *result
		}(i, path)
	}
	wg.Wait()

	// Each goroutine wrote only its own index, so reading after Wait is safe.
	for _, err := range errs {
		if err != nil {
			return results, fmt.Errorf("batch had errors: %w", err)
		}
	}
	return results, nil
}

Cache analysis to avoid redundant API calls:

// AnalysisCache memoizes AnalyzeFile results in memory so repeated requests
// for the same file and prompt do not trigger another API call.
//
// The zero value is ready to use. Because it contains a mutex, an
// AnalysisCache must not be copied after first use.
type AnalysisCache struct {
	cache map[string]*AnalysisResult // guarded by mu; allocated on first store
	mu    sync.RWMutex
}

// GetOrAnalyze returns the cached result for path and prompt, or calls
// analyzer.AnalyzeFile and stores the result on a miss.
//
// Failed analyses are not cached. The lock is not held while AnalyzeFile
// runs, so concurrent misses for the same key each perform their own
// analysis and the last one to finish is the entry that stays cached. The
// returned pointer is shared with the cache and with other callers.
//
// NOTE(review): hashFile and hashString are defined outside this snippet;
// presumably they return content digests — confirm, including how hashFile
// reports an unreadable file.
func (c *AnalysisCache) GetOrAnalyze(ctx context.Context, analyzer *ImageAnalyzer, path, prompt string) (*AnalysisResult, error) {
	// Create cache key from file hash + prompt
	key := hashFile(path) + ":" + hashString(prompt)

	c.mu.RLock()
	cached, ok := c.cache[key] // reading a nil map is safe
	c.mu.RUnlock()
	if ok {
		return cached, nil
	}

	result, err := analyzer.AnalyzeFile(ctx, path, prompt)
	if err != nil {
		return nil, err
	}

	c.mu.Lock()
	if c.cache == nil {
		// Writing to a nil map panics; allocate lazily so a zero-value cache
		// works.
		c.cache = make(map[string]*AnalysisResult)
	}
	c.cache[key] = result
	c.mu.Unlock()

	return result, nil
}

Handle image-specific errors:

// AnalyzeWithRetry calls AnalyzeFile up to three times, pausing between
// attempts, and returns the first successful result.
//
// Errors that unwrap to *core.APIError are handled by status code:
//   - 400 and 413 are returned immediately (retrying the same image cannot
//     succeed);
//   - 429 waits 5s, then 10s, before the next attempt;
//   - anything else, and non-API errors, wait 1s, then 2s.
//
// The wait happens only between attempts, never after the last one, and is
// abandoned as soon as ctx is done, in which case the returned error wraps
// ctx.Err(). After the final failed attempt the last error is returned
// wrapped.
func (a *ImageAnalyzer) AnalyzeWithRetry(ctx context.Context, path, prompt string) (*AnalysisResult, error) {
	const maxAttempts = 3

	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		result, err := a.AnalyzeFile(ctx, path, prompt)
		if err == nil {
			return result, nil
		}
		lastErr = err

		// Unknown error - retry with backoff
		delay := time.Duration(attempt+1) * time.Second

		// Check for specific errors
		var apiErr *core.APIError
		if errors.As(err, &apiErr) {
			switch apiErr.StatusCode {
			case 400:
				// Bad request - likely invalid image
				return nil, fmt.Errorf("invalid image: %w", err)
			case 413:
				// Image too large
				return nil, fmt.Errorf("image too large (max 20MB): %w", err)
			case 429:
				// Rate limited - wait longer before retrying
				delay = time.Duration(attempt+1) * 5 * time.Second
			}
		}

		if attempt == maxAttempts-1 {
			// No further attempt follows, so waiting would only delay the
			// failure report.
			break
		}

		timer := time.NewTimer(delay)
		select {
		case <-timer.C:
		case <-ctx.Done():
			timer.Stop()
			return nil, fmt.Errorf("analysis cancelled while waiting to retry (last error: %v): %w", lastErr, ctx.Err())
		}
	}
	return nil, fmt.Errorf("analysis failed after retries: %w", lastErr)
}
// Architecture review: analyze a diagram referenced by URL, asking for a
// component, data-flow, risk and recommendation breakdown.
result, err := analyzer.AnalyzeURL(ctx,
"https://example.com/architecture.png",
`Analyze this system architecture diagram:
1. Identify all components and their roles
2. Trace the data flow from user to database
3. Identify single points of failure
4. Suggest improvements for scalability
Format as: Component Analysis, Data Flow, Risks, Recommendations`,
)
// Accessibility audit: analyze a local screenshot. The prompt asks for
// findings as "[LEVEL] Issue - Recommendation", the bracketed shape that
// AnalyzeFile parses into result.Issues.
result, err := analyzer.AnalyzeFile(ctx,
"./screenshots/login-page.png",
`Perform an accessibility audit:
1. Check color contrast ratios
2. Identify missing labels or alt text indicators
3. Evaluate touch target sizes
4. Check for clear visual hierarchy
Format issues as: [WCAG-LEVEL] Issue - Recommendation`,
)
// Receipt processing: describe the fields to pull from the receipt, then have
// ExtractData return them as a JSON-decoded map.
schema := ExtractionSchema{
Fields: []SchemaField{
{Name: "store_name", Type: "string", Description: "Name of the store"},
{Name: "date", Type: "string", Description: "Transaction date"},
{Name: "items", Type: "array", Description: "List of items with name, qty, price"},
{Name: "subtotal", Type: "number", Description: "Subtotal before tax"},
{Name: "tax", Type: "number", Description: "Tax amount"},
{Name: "total", Type: "number", Description: "Total amount"},
{Name: "payment_method", Type: "string", Description: "How payment was made"},
},
}
data, err := analyzer.ExtractData(ctx, "./receipts/grocery.jpg", schema)
=== Architecture Diagram Analysis ===
The diagram shows a three-tier web application with the following components:
**Components:**
- Load Balancer (nginx) - Entry point, distributes traffic
- API Servers (3x) - Stateless application logic
- Redis Cache - Session storage and caching
- PostgreSQL Primary - Main database
- PostgreSQL Replica - Read replica
**Data Flow:**
User → Load Balancer → API Server → Cache/Database
**Potential Bottlenecks:**
1. Single Redis instance - no failover
2. Database writes concentrated on primary
3. No CDN for static assets
**Recommendations:**
- Add Redis Sentinel for cache HA
- Consider read replicas for heavy read operations
- Implement connection pooling (PgBouncer)
=== Screenshot Analysis ===
Dashboard UI review completed.
- [HIGH] Low contrast text - White text on light gray background fails WCAG AA. Use #333 on #fff.
- [MEDIUM] Small touch targets - Filter buttons are 32px. Increase to 44px minimum.
- [LOW] Missing loading states - Charts show no skeleton loaders.
| Practice | Recommendation |
|----------|----------------|
| Image size | Resize to max 2048px for faster processing |
| Format | Use JPEG for photos, PNG for diagrams |
| Prompts | Be specific about what to analyze |
| Temperature | Use 0.0-0.3 for factual extraction |
| JSON mode | Enable for structured data extraction |
| Batching | Limit concurrent requests to 5-10 |

OpenAI Vision

GPT-4o vision capabilities. OpenAI →