Multimodal QA

This example demonstrates how to build applications that combine text and images for richer AI interactions. From analyzing architecture diagrams to processing screenshots for automated testing, multimodal capabilities unlock new use cases.

What You’ll Build

A visual analysis system that:

Processes images from URLs, files, and base64 data
Analyzes diagrams, charts, screenshots, and documents
Extracts structured information from visual content
Supports multiple images in a single conversation

Use Cases

Use Case	Description
Architecture review	Analyze system diagrams for bottlenecks
UI/UX feedback	Review mockups and screenshots
Document processing	Extract data from forms, receipts, invoices
Chart analysis	Interpret graphs and visualizations
Accessibility audit	Identify issues in UI screenshots

Prerequisites

# Install dependencies
go get github.com/petal-labs/iris

# Set up API key (choose your provider)
iris keys set openai    # GPT-4o
iris keys set anthropic # Claude
iris keys set gemini    # Gemini

Complete Implementation

package main

import (
    "context"
    "encoding/base64"
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "path/filepath"
    "strings"
    "time"

    "github.com/petal-labs/iris/core"
    "github.com/petal-labs/iris/providers/openai"
)

// ImageAnalyzer provides visual analysis capabilities. It pairs an Iris
// client with the model name that every analysis method passes to
// client.Chat.
type ImageAnalyzer struct {
    // client builds and sends the chat requests.
    client *core.Client
    // model is the model identifier used for every request.
    model  string
}

// AnalysisResult is the structured output of an analysis call.
//
// Description carries the model's reply text. The remaining fields are
// optional and are dropped from the JSON encoding when empty.
type AnalysisResult struct {
    Description string            `json:"description"`
    Elements    []VisualElement   `json:"elements,omitempty"`
    Issues      []Issue           `json:"issues,omitempty"`
    Metadata    map[string]string `json:"metadata,omitempty"`
}

// VisualElement represents a detected component.
//
// NOTE(review): none of the analyzer methods in this example populate
// AnalysisResult.Elements yet — confirm the intended producer.
type VisualElement struct {
    // Type is the kind of component; required in the JSON encoding.
    Type        string `json:"type"`
    // Description is free-form text about the component.
    Description string `json:"description"`
    // Location is optional and omitted from JSON when empty.
    Location    string `json:"location,omitempty"`
}

// Issue represents a detected problem. In this example issues are produced
// by parseIssues from reply lines of the form
// "[SEVERITY] Description - Suggestion".
type Issue struct {
    // Severity is the lowercased text found between the square brackets.
    Severity    string `json:"severity"`
    // Description is the text after the bracket, up to the first " - ".
    Description string `json:"description"`
    // Suggestion is the text after the first " - "; empty when absent.
    Suggestion  string `json:"suggestion,omitempty"`
}

// main runs the four example analyses in sequence: a diagram fetched by URL,
// a local screenshot, a two-image comparison, and structured invoice
// extraction. A failure in one example is logged and the next one still runs;
// all requests share a single two-minute deadline.
func main() {
    // Build the analyzer before creating the context: log.Fatal calls
    // os.Exit, which would skip a cancel deferred earlier.
    analyzer, err := NewImageAnalyzer()
    if err != nil {
        log.Fatal(err)
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
    defer cancel()

    // Example 1: Analyze an architecture diagram from URL
    fmt.Println("=== Architecture Diagram Analysis ===")
    result, err := analyzer.AnalyzeURL(ctx,
        "https://example.com/architecture-diagram.png",
        "Analyze this system architecture. Identify components, data flows, and potential bottlenecks.",
    )
    if err != nil {
        log.Printf("URL analysis error: %v", err)
    } else {
        fmt.Println(result.Description)
    }

    // Example 2: Analyze a local file
    fmt.Println("\n=== Screenshot Analysis ===")
    result, err = analyzer.AnalyzeFile(ctx,
        "./screenshots/dashboard.png",
        "Review this dashboard UI. Check for usability issues and suggest improvements.",
    )
    if err != nil {
        log.Printf("File analysis error: %v", err)
    } else {
        fmt.Println(result.Description)
        for _, issue := range result.Issues {
            fmt.Printf("- [%s] %s\n", issue.Severity, issue.Description)
        }
    }

    // Example 3: Compare multiple images
    fmt.Println("\n=== Design Comparison ===")
    result, err = analyzer.CompareImages(ctx,
        []string{"./mockups/v1.png", "./mockups/v2.png"},
        "Compare these two design versions. What changed? Which is better for usability?",
    )
    if err != nil {
        log.Printf("Comparison error: %v", err)
    } else {
        fmt.Println(result.Description)
    }

    // Example 4: Extract structured data
    fmt.Println("\n=== Data Extraction ===")
    data, err := analyzer.ExtractData(ctx,
        "./documents/invoice.png",
        InvoiceSchema,
    )
    if err != nil {
        log.Printf("Extraction error: %v", err)
    } else {
        fmt.Printf("Invoice Data: %+v\n", data)
    }
}

// NewImageAnalyzer creates an analyzer backed by the OpenAI provider.
//
// Credentials are looked up in the Iris keystore first and in the environment
// second. If both lookups fail, the returned error reports both causes; the
// environment error is the wrapped one, so errors.Is/As behave as before.
// The client retries 429, 500 and 503 responses up to three times with
// exponential backoff (1s initial, 30s cap, x2 multiplier).
func NewImageAnalyzer() (*ImageAnalyzer, error) {
    provider, err := openai.NewFromKeystore()
    if err != nil {
        // Keep the keystore failure so it is not lost if the fallback fails too.
        keystoreErr := err
        provider, err = openai.NewFromEnv()
        if err != nil {
            return nil, fmt.Errorf("openai not configured: keystore: %v; env: %w", keystoreErr, err)
        }
    }

    client := core.NewClient(provider,
        core.WithRetryPolicy(&core.RetryPolicy{
            MaxRetries:        3,
            InitialInterval:   1 * time.Second,
            MaxInterval:       30 * time.Second,
            BackoffMultiplier: 2.0,
            RetryOn:           []int{429, 500, 503},
        }),
    )

    return &ImageAnalyzer{
        client: client,
        model:  "gpt-4o",
    }, nil
}

// AnalyzeURL sends prompt together with the image at imageURL to the
// configured model and returns the reply text as the result's Description.
// Request failures are wrapped with "analysis failed".
func (a *ImageAnalyzer) AnalyzeURL(ctx context.Context, imageURL, prompt string) (*AnalysisResult, error) {
    const systemPrompt = "You are an expert visual analyst. Provide detailed, actionable analysis."

    // Assemble the multimodal user turn: the text prompt plus one image reference.
    userTurn := a.client.Chat(a.model).
        System(systemPrompt).
        UserMultimodal().
        Text(prompt).
        ImageURL(imageURL)

    reply, err := userTurn.Done().Temperature(0.3).GetResponse(ctx)
    if err != nil {
        return nil, fmt.Errorf("analysis failed: %w", err)
    }

    analysis := &AnalysisResult{Description: reply.Output}
    return analysis, nil
}

// AnalyzeFile loads the image at filePath, submits it inline as base64 along
// with prompt, and returns the model's reply plus any issues that parseIssues
// finds in that reply. The MIME type is derived from the file extension.
func (a *ImageAnalyzer) AnalyzeFile(ctx context.Context, filePath, prompt string) (*AnalysisResult, error) {
    const systemPrompt = `You are an expert visual analyst. When analyzing images:
1. Describe what you see in detail
2. Identify any issues or problems
3. Provide specific, actionable suggestions
Format issues as: [SEVERITY] Description - Suggestion`

    raw, err := os.ReadFile(filePath)
    if err != nil {
        return nil, fmt.Errorf("reading file: %w", err)
    }

    // The image travels inside the request body, so it must be base64 text.
    userTurn := a.client.Chat(a.model).
        System(systemPrompt).
        UserMultimodal().
        Text(prompt).
        ImageBase64(base64.StdEncoding.EncodeToString(raw), getMimeType(filePath))

    reply, err := userTurn.Done().Temperature(0.3).GetResponse(ctx)
    if err != nil {
        return nil, fmt.Errorf("analysis failed: %w", err)
    }

    // The system prompt asks for "[SEVERITY] ..." lines; lift them into Issues.
    analysis := &AnalysisResult{Description: reply.Output}
    analysis.Issues = parseIssues(reply.Output)
    return analysis, nil
}

// CompareImages sends every image in filePaths, in order, together with
// prompt in a single request and returns the model's comparison as the
// result's Description.
//
// It returns an error without contacting the provider when filePaths is
// empty, or when any file cannot be read (the index of the failing file is
// included in the error).
func (a *ImageAnalyzer) CompareImages(ctx context.Context, filePaths []string, prompt string) (*AnalysisResult, error) {
    // A comparison with no images would only waste an API call.
    if len(filePaths) == 0 {
        return nil, fmt.Errorf("comparison requires at least one image")
    }

    builder := a.client.Chat(a.model).
        System("You are an expert at comparing visual designs. Analyze differences and provide recommendations.").
        UserMultimodal().
        Text(prompt)

    // Add each image
    for i, path := range filePaths {
        data, err := os.ReadFile(path)
        if err != nil {
            return nil, fmt.Errorf("reading file %d: %w", i, err)
        }

        base64Data := base64.StdEncoding.EncodeToString(data)
        mimeType := getMimeType(path)
        builder = builder.ImageBase64(base64Data, mimeType)
    }

    resp, err := builder.Done().
        Temperature(0.3).
        GetResponse(ctx)
    if err != nil {
        return nil, fmt.Errorf("comparison failed: %w", err)
    }

    return &AnalysisResult{
        Description: resp.Output,
    }, nil
}

// ExtractionSchema describes the fields to pull out of a document image.
// ExtractData turns each field into one bullet of the extraction prompt.
type ExtractionSchema struct {
    Fields []SchemaField `json:"fields"`
}

// SchemaField is one requested field of an ExtractionSchema. ExtractData
// renders it into the prompt as "- Name (Type): Description".
type SchemaField struct {
    // Name is the exact JSON key the model is asked to use.
    Name        string `json:"name"`
    // Type is a free-form type hint shown to the model (e.g. "string", "number", "array").
    Type        string `json:"type"`
    // Description tells the model what the field means.
    Description string `json:"description"`
}

// InvoiceSchema is the example schema that main passes to ExtractData to read
// an invoice image.
var InvoiceSchema = ExtractionSchema{
    Fields: []SchemaField{
        {Name: "vendor", Type: "string", Description: "Company or person who issued the invoice"},
        {Name: "invoice_number", Type: "string", Description: "Invoice number or ID"},
        {Name: "date", Type: "string", Description: "Invoice date in ISO format"},
        {Name: "total", Type: "number", Description: "Total amount"},
        {Name: "currency", Type: "string", Description: "Currency code (USD, EUR, etc.)"},
        {Name: "items", Type: "array", Description: "Line items with description and amount"},
    },
}

// ExtractData asks the model to read the document image at filePath and
// return the fields listed in schema as a JSON object, which is decoded into
// a generic map. The request uses JSON mode and temperature 0.
func (a *ImageAnalyzer) ExtractData(ctx context.Context, filePath string, schema ExtractionSchema) (map[string]interface{}, error) {
    const promptTemplate = `Extract the following information from this document:
%s

Return the data as a JSON object with these exact field names.
If a field cannot be determined, use null.`

    raw, err := os.ReadFile(filePath)
    if err != nil {
        return nil, fmt.Errorf("reading file: %w", err)
    }

    // One bullet per requested field: "- name (type): description".
    var bullets []string
    for i := range schema.Fields {
        field := schema.Fields[i]
        bullets = append(bullets, fmt.Sprintf("- %s (%s): %s", field.Name, field.Type, field.Description))
    }
    prompt := fmt.Sprintf(promptTemplate, strings.Join(bullets, "\n"))

    userTurn := a.client.Chat(a.model).
        System("You are a data extraction specialist. Extract structured data from documents accurately. Always respond with valid JSON.").
        UserMultimodal().
        Text(prompt).
        ImageBase64(base64.StdEncoding.EncodeToString(raw), getMimeType(filePath))

    reply, err := userTurn.Done().
        Temperature(0.0).
        JSONMode(true).
        GetResponse(ctx)
    if err != nil {
        return nil, fmt.Errorf("extraction failed: %w", err)
    }

    // Decode the reply; anything that is not a JSON object is an error.
    var fields map[string]interface{}
    if decodeErr := json.Unmarshal([]byte(reply.Output), &fields); decodeErr != nil {
        return nil, fmt.Errorf("parsing response: %w", decodeErr)
    }
    return fields, nil
}

// Helper functions

// getMimeType maps a file path's extension (case-insensitive) to an image
// MIME type. Unknown or missing extensions fall back to "image/png".
func getMimeType(path string) string {
    known := map[string]string{
        ".jpg":  "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png":  "image/png",
        ".gif":  "image/gif",
        ".webp": "image/webp",
    }

    if mimeType, ok := known[strings.ToLower(filepath.Ext(path))]; ok {
        return mimeType
    }
    return "image/png"
}

// parseIssues scans text line by line for entries of the form
//
//     [SEVERITY] Description - Suggestion
//
// and returns one Issue per match. A leading list marker ("- ", "* " or "• ")
// is tolerated, because models usually emit issues as a bulleted list. The
// severity is lowercased; the suggestion is everything after the first " - "
// and is empty when there is none. Lines without a closing bracket, and lines
// whose brackets are empty (such as markdown checkboxes "[ ]"), are skipped.
// It returns nil when nothing matches.
func parseIssues(text string) []Issue {
    var issues []Issue

    for _, line := range strings.Split(text, "\n") {
        line = strings.TrimSpace(line)

        // Drop a bullet marker so "- [HIGH] ..." is treated like "[HIGH] ...".
        for _, marker := range []string{"- ", "* ", "• "} {
            line = strings.TrimPrefix(line, marker)
        }
        line = strings.TrimSpace(line)

        if !strings.HasPrefix(line, "[") {
            continue
        }
        closeIdx := strings.Index(line, "]")
        if closeIdx < 0 {
            continue
        }

        severity := strings.ToLower(strings.TrimSpace(line[1:closeIdx]))
        if severity == "" {
            continue
        }

        description := strings.TrimSpace(line[closeIdx+1:])
        var suggestion string
        // Split on the first " - " only, so suggestions may contain dashes.
        if dashIdx := strings.Index(description, " - "); dashIdx > 0 {
            suggestion = strings.TrimSpace(description[dashIdx+3:])
            description = strings.TrimSpace(description[:dashIdx])
        }

        issues = append(issues, Issue{
            Severity:    severity,
            Description: description,
            Suggestion:  suggestion,
        })
    }

    return issues
}

// Using Claude for vision analysis
import "github.com/petal-labs/iris/providers/anthropic"

// NewClaudeAnalyzer creates an analyzer backed by the Anthropic provider,
// using credentials from the Iris keystore. A keystore failure is wrapped the
// same way NewImageAnalyzer reports a missing OpenAI configuration.
func NewClaudeAnalyzer() (*ImageAnalyzer, error) {
    provider, err := anthropic.NewFromKeystore()
    if err != nil {
        return nil, fmt.Errorf("anthropic not configured: %w", err)
    }

    client := core.NewClient(provider)

    return &ImageAnalyzer{
        client: client,
        model:  "claude-sonnet-4-20250514",
    }, nil
}

// AnalyzePDF reads the PDF at pdfPath, attaches it to the request inline as
// base64 with the "application/pdf" MIME type, and returns the model's reply
// as the result's Description. Intended for providers with native PDF input,
// such as Claude.
//
// Read and request failures are wrapped with the same prefixes AnalyzeFile
// uses.
func (a *ImageAnalyzer) AnalyzePDF(ctx context.Context, pdfPath, prompt string) (*AnalysisResult, error) {
    data, err := os.ReadFile(pdfPath)
    if err != nil {
        return nil, fmt.Errorf("reading file: %w", err)
    }

    base64Data := base64.StdEncoding.EncodeToString(data)

    resp, err := a.client.Chat(a.model).
        System("You are a document analysis expert.").
        UserMultimodal().
            Text(prompt).
            File(base64Data, "application/pdf").
            Done().
        GetResponse(ctx)
    if err != nil {
        return nil, fmt.Errorf("analysis failed: %w", err)
    }

    return &AnalysisResult{
        Description: resp.Output,
    }, nil
}

// Using Gemini for vision and video analysis
import "github.com/petal-labs/iris/providers/gemini"

// NewGeminiAnalyzer creates an analyzer backed by the Gemini provider, using
// credentials from the Iris keystore. A keystore failure is wrapped the same
// way NewImageAnalyzer reports a missing OpenAI configuration.
func NewGeminiAnalyzer() (*ImageAnalyzer, error) {
    provider, err := gemini.NewFromKeystore()
    if err != nil {
        return nil, fmt.Errorf("gemini not configured: %w", err)
    }

    client := core.NewClient(provider)

    return &ImageAnalyzer{
        client: client,
        model:  "gemini-2.0-flash",
    }, nil
}

// AnalyzeVideo reads the video at videoPath, attaches it to the request
// inline as base64, and returns the model's reply as the result's
// Description. Intended for providers with video input, such as Gemini.
//
// The whole file is loaded into memory and is always labelled "video/mp4",
// whatever its extension. Read and request failures are wrapped with the
// same prefixes AnalyzeFile uses.
func (a *ImageAnalyzer) AnalyzeVideo(ctx context.Context, videoPath, prompt string) (*AnalysisResult, error) {
    data, err := os.ReadFile(videoPath)
    if err != nil {
        return nil, fmt.Errorf("reading file: %w", err)
    }

    base64Data := base64.StdEncoding.EncodeToString(data)

    resp, err := a.client.Chat(a.model).
        System("You are a video analysis expert.").
        UserMultimodal().
            Text(prompt).
            Video(base64Data, "video/mp4").
            Done().
        GetResponse(ctx)
    if err != nil {
        return nil, fmt.Errorf("analysis failed: %w", err)
    }

    return &AnalysisResult{
        Description: resp.Output,
    }, nil
}

Key Concepts

Image Input Methods

Iris supports multiple ways to include images:

// From URL: pass the image's address with ImageURL.
builder.UserMultimodal().
    Text("Describe this image").
    ImageURL("https://example.com/image.png").
    Done()

// From base64-encoded data: pass the encoded bytes and their MIME type
// with ImageBase64.
builder.UserMultimodal().
    Text("Describe this image").
    ImageBase64(base64Data, "image/png").
    Done()

// Multiple images: chain one image call per image before Done.
builder.UserMultimodal().
    Text("Compare these images").
    ImageURL("https://example.com/before.png").
    ImageURL("https://example.com/after.png").
    Done()

Provider-Specific Features

Different providers offer unique capabilities:

Provider	Feature	Method
OpenAI	High/low detail mode	`ImageURL(url, core.ImageDetailHigh)`
Claude	PDF processing	`File(data, "application/pdf")`
Gemini	Video analysis	`Video(data, "video/mp4")`
Gemini	Audio analysis	`Audio(data, "audio/mp3")`

Structured Data Extraction

Use JSON mode for reliable data extraction:

// Request a JSON-only reply for the invoice image.
resp, err := client.Chat("gpt-4o").
    System("Extract data as JSON.").
    UserMultimodal().
        Text("Extract the invoice details.").
        ImageBase64(base64Data, "image/png").
        Done().
    JSONMode(true).
    GetResponse(ctx)
if err != nil {
    // resp must not be used when the request failed.
    log.Fatalf("extraction failed: %v", err)
}

// Decode the reply; check the error so a malformed reply is not mistaken
// for an empty result.
var data map[string]interface{}
if err := json.Unmarshal([]byte(resp.Output), &data); err != nil {
    log.Fatalf("parsing response: %v", err)
}

Production Patterns

Image Preprocessing

Optimize images before sending:

import "image"
import "image/jpeg"

// preprocessImage loads the image at path, shrinks it with resize when either
// dimension exceeds maxSize pixels, and returns it re-encoded as a
// quality-85 JPEG.
//
// maxSize must be positive. image.Decode only understands formats whose
// decoder package has been imported: "image/jpeg" is imported for encoding,
// which also registers JPEG decoding, but PNG or GIF input additionally
// needs a blank import such as `_ "image/png"`. JPEG has no alpha channel,
// so transparency in the source is lost.
func preprocessImage(path string, maxSize int) ([]byte, error) {
    // A non-positive limit would send every image through resize with a
    // meaningless target size.
    if maxSize <= 0 {
        return nil, fmt.Errorf("preprocessing %q: maxSize must be positive, got %d", path, maxSize)
    }

    file, err := os.Open(path)
    if err != nil {
        return nil, fmt.Errorf("opening image: %w", err)
    }
    defer file.Close()

    img, _, err := image.Decode(file)
    if err != nil {
        return nil, fmt.Errorf("decoding image %q: %w", path, err)
    }

    // Resize if needed
    bounds := img.Bounds()
    if bounds.Dx() > maxSize || bounds.Dy() > maxSize {
        img = resize(img, maxSize)
    }

    // Encode as JPEG for smaller size
    var buf bytes.Buffer
    if err := jpeg.Encode(&buf, img, &jpeg.Options{Quality: 85}); err != nil {
        return nil, fmt.Errorf("encoding image %q as JPEG: %w", path, err)
    }

    return buf.Bytes(), nil
}

Batch Processing

Process multiple images efficiently:

// BatchAnalyze runs AnalyzeFile on every path with the same prompt, at most
// five requests at a time, and returns the results in the same order as
// paths.
//
// If any analysis fails, the returned error wraps the first failure (by
// index), names the failing path, and reports how many of the paths failed.
// The results slice is still returned in that case; entries for failed paths
// are zero values. Goroutines still waiting for a slot when ctx is cancelled
// give up without starting their analysis.
func (a *ImageAnalyzer) BatchAnalyze(ctx context.Context, paths []string, prompt string) ([]AnalysisResult, error) {
    results := make([]AnalysisResult, len(paths))
    // Named errs so the standard errors package is not shadowed.
    errs := make([]error, len(paths))

    var wg sync.WaitGroup
    sem := make(chan struct{}, 5) // Limit concurrency

    for i, path := range paths {
        wg.Add(1)
        go func(idx int, p string) {
            defer wg.Done()

            // Wait for a slot, but stop waiting once the caller cancels.
            select {
            case sem <- struct{}{}:
                defer func() { <-sem }()
            case <-ctx.Done():
                errs[idx] = fmt.Errorf("analyzing %q: %w", p, ctx.Err())
                return
            }

            result, err := a.AnalyzeFile(ctx, p, prompt)
            if err != nil {
                errs[idx] = fmt.Errorf("analyzing %q: %w", p, err)
                return
            }
            results[idx] = *result
        }(i, path)
    }

    wg.Wait()

    // Summarize failures: keep the first one for wrapping and count the rest.
    var firstErr error
    failed := 0
    for _, err := range errs {
        if err == nil {
            continue
        }
        failed++
        if firstErr == nil {
            firstErr = err
        }
    }
    if firstErr != nil {
        return results, fmt.Errorf("batch had errors (%d of %d failed): %w", failed, len(paths), firstErr)
    }

    return results, nil
}

Caching Results

Cache analysis to avoid redundant API calls:

// AnalysisCache stores analysis results in memory so that repeated requests
// for the same file and prompt can skip the API call. Because it contains a
// mutex it must be shared by pointer, not copied.
type AnalysisCache struct {
    // cache maps the key built in GetOrAnalyze to a stored result.
    cache map[string]*AnalysisResult
    // mu guards cache.
    mu    sync.RWMutex
}

// GetOrAnalyze returns the cached result for the given file and prompt, or
// runs analyzer.AnalyzeFile and caches its result on a miss. Failed analyses
// are not cached.
//
// The zero-value AnalysisCache is usable: the map is created on first store.
// Concurrent misses for the same key may each call the analyzer; the last
// one to finish wins the cache slot.
func (c *AnalysisCache) GetOrAnalyze(ctx context.Context, analyzer *ImageAnalyzer, path, prompt string) (*AnalysisResult, error) {
    // Create cache key from file hash + prompt
    key := hashFile(path) + ":" + hashString(prompt)

    // Reading from a nil map is safe, so no initialization is needed here.
    c.mu.RLock()
    cached, ok := c.cache[key]
    c.mu.RUnlock()
    if ok {
        return cached, nil
    }

    // The lock is not held during the (slow) API call.
    result, err := analyzer.AnalyzeFile(ctx, path, prompt)
    if err != nil {
        return nil, err
    }

    c.mu.Lock()
    if c.cache == nil {
        // Writing to a nil map panics; create it lazily.
        c.cache = make(map[string]*AnalysisResult)
    }
    c.cache[key] = result
    c.mu.Unlock()

    return result, nil
}

Error Handling

Handle image-specific errors:

// AnalyzeWithRetry calls AnalyzeFile up to three times.
//
// API errors with status 400 (invalid image) or 413 (image too large) are
// returned immediately because retrying cannot help. Status 429 waits
// 5s x attempt before the next try; any other error waits 1s x attempt.
// Waiting stops as soon as ctx is cancelled, and there is no wait after the
// final attempt. The last error is wrapped in the returned error.
func (a *ImageAnalyzer) AnalyzeWithRetry(ctx context.Context, path, prompt string) (*AnalysisResult, error) {
    const maxAttempts = 3

    var lastErr error

    for attempt := 0; attempt < maxAttempts; attempt++ {
        result, err := a.AnalyzeFile(ctx, path, prompt)
        if err == nil {
            return result, nil
        }

        lastErr = err

        // Once the caller's context is done every further attempt would fail too.
        if ctx.Err() != nil {
            return nil, fmt.Errorf("analysis aborted: %w", err)
        }

        // Unknown error - retry with backoff
        wait := time.Duration(attempt+1) * time.Second

        // Check for specific errors
        var apiErr *core.APIError
        if errors.As(err, &apiErr) {
            switch apiErr.StatusCode {
            case 400:
                // Bad request - likely invalid image
                return nil, fmt.Errorf("invalid image: %w", err)
            case 413:
                // Image too large
                return nil, fmt.Errorf("image too large (max 20MB): %w", err)
            case 429:
                // Rate limited - wait longer before retrying
                wait = time.Duration(attempt+1) * 5 * time.Second
            }
        }

        // No point sleeping when there is no next attempt.
        if attempt == maxAttempts-1 {
            break
        }

        // Sleep, but wake up early if the context is cancelled.
        timer := time.NewTimer(wait)
        select {
        case <-ctx.Done():
            timer.Stop()
            return nil, fmt.Errorf("analysis aborted while waiting to retry: %w", ctx.Err())
        case <-timer.C:
        }
    }

    return nil, fmt.Errorf("analysis failed after retries: %w", lastErr)
}

Real-World Examples

Architecture Review

// Review an architecture diagram by URL. The numbered prompt steers the
// model toward a fixed set of sections in its reply.
result, err := analyzer.AnalyzeURL(ctx,
    "https://example.com/architecture.png",
    `Analyze this system architecture diagram:
1. Identify all components and their roles
2. Trace the data flow from user to database
3. Identify single points of failure
4. Suggest improvements for scalability
Format as: Component Analysis, Data Flow, Risks, Recommendations`,
)

UI Accessibility Audit

// Audit a local screenshot. The requested "[WCAG-LEVEL] Issue - Recommendation"
// line format has the bracketed shape that parseIssues looks for, so the
// bracketed level ends up in Issue.Severity.
result, err := analyzer.AnalyzeFile(ctx,
    "./screenshots/login-page.png",
    `Perform an accessibility audit:
1. Check color contrast ratios
2. Identify missing labels or alt text indicators
3. Evaluate touch target sizes
4. Check for clear visual hierarchy
Format issues as: [WCAG-LEVEL] Issue - Recommendation`,
)

Receipt Data Extraction

// Describe the receipt fields to extract; each entry becomes one bullet in
// the prompt that ExtractData builds.
schema := ExtractionSchema{
    Fields: []SchemaField{
        {Name: "store_name", Type: "string", Description: "Name of the store"},
        {Name: "date", Type: "string", Description: "Transaction date"},
        {Name: "items", Type: "array", Description: "List of items with name, qty, price"},
        {Name: "subtotal", Type: "number", Description: "Subtotal before tax"},
        {Name: "tax", Type: "number", Description: "Tax amount"},
        {Name: "total", Type: "number", Description: "Total amount"},
        {Name: "payment_method", Type: "string", Description: "How payment was made"},
    },
}

// data is a generic map keyed by the field names above.
data, err := analyzer.ExtractData(ctx, "./receipts/grocery.jpg", schema)

Example Output

=== Architecture Diagram Analysis ===
The diagram shows a three-tier web application with the following components:

**Components:**
- Load Balancer (nginx) - Entry point, distributes traffic
- API Servers (3x) - Stateless application logic
- Redis Cache - Session storage and caching
- PostgreSQL Primary - Main database
- PostgreSQL Replica - Read replica

**Data Flow:**
User → Load Balancer → API Server → Cache/Database

**Potential Bottlenecks:**
1. Single Redis instance - no failover
2. Database writes concentrated on primary
3. No CDN for static assets

**Recommendations:**
- Add Redis Sentinel for cache HA
- Consider read replicas for heavy read operations
- Implement connection pooling (PgBouncer)

=== Screenshot Analysis ===
Dashboard UI review completed.

- [HIGH] Low contrast text - White text on light gray background fails WCAG AA. Use #333 on #fff.
- [MEDIUM] Small touch targets - Filter buttons are 32px. Increase to 44px minimum.
- [LOW] Missing loading states - Charts show no skeleton loaders.

Best Practices

Practice	Recommendation
Image size	Resize to max 2048px for faster processing
Format	Use JPEG for photos, PNG for diagrams
Prompts	Be specific about what to analyze
Temperature	Use 0.0-0.3 for factual extraction
JSON mode	Enable for structured data extraction
Batching	Limit concurrent requests to 5-10

Next Steps

OpenAI Vision

GPT-4o vision capabilities. OpenAI →

Anthropic Files

Claude PDF processing. Anthropic →