OpenAI Vision
GPT-4o vision capabilities. OpenAI →
This example demonstrates how to build applications that combine text and images for richer AI interactions. From analyzing architecture diagrams to processing screenshots for automated testing, multimodal capabilities unlock new use cases.
A visual analysis system that supports the following use cases:
| Use Case | Description |
|---|---|
| Architecture review | Analyze system diagrams for bottlenecks |
| UI/UX feedback | Review mockups and screenshots |
| Document processing | Extract data from forms, receipts, invoices |
| Chart analysis | Interpret graphs and visualizations |
| Accessibility audit | Identify issues in UI screenshots |
# Install dependencies
go get github.com/petal-labs/iris
# Set up API key (choose your provider)
iris keys set openai     # GPT-4o
iris keys set anthropic  # Claude
iris keys set gemini     # Gemini Pro Vision
package main
import ( "context" "encoding/base64" "encoding/json" "fmt" "io" "log" "net/http" "os" "path/filepath" "strings" "time"
"github.com/petal-labs/iris/core" "github.com/petal-labs/iris/providers/openai")
// ImageAnalyzer provides visual analysis capabilities.
// It pairs an Iris client with the model name that every request built by its
// methods is addressed to.
type ImageAnalyzer struct {
	// client builds and sends the chat requests.
	client *core.Client
	// model is the model name passed to client.Chat.
	model string
}
// AnalysisResult contains structured analysis output.
type AnalysisResult struct {
	// Description holds the model's response text.
	Description string `json:"description"`
	// Elements lists detected components. None of the analyzer methods in
	// this file populate it.
	Elements []VisualElement `json:"elements,omitempty"`
	// Issues lists problems parsed out of the response text; only AnalyzeFile
	// fills it in.
	Issues []Issue `json:"issues,omitempty"`
	// Metadata carries free-form key/value annotations. None of the analyzer
	// methods in this file populate it.
	Metadata map[string]string `json:"metadata,omitempty"`
}
// VisualElement represents a detected component.
type VisualElement struct {
	// Type names the kind of component.
	Type string `json:"type"`
	// Description is a free-text description of the component.
	Description string `json:"description"`
	// Location says where the component appears; omitted from JSON when empty.
	Location string `json:"location,omitempty"`
}
// Issue represents a detected problem.
type Issue struct {
	// Severity is the issue's severity label. parseIssues stores it
	// lower-cased.
	Severity string `json:"severity"`
	// Description states what is wrong.
	Description string `json:"description"`
	// Suggestion is the proposed remedy; omitted from JSON when empty.
	Suggestion string `json:"suggestion,omitempty"`
}
func main() { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel()
analyzer, err := NewImageAnalyzer() if err != nil { log.Fatal(err) }
// Example 1: Analyze an architecture diagram from URL fmt.Println("=== Architecture Diagram Analysis ===") result, err := analyzer.AnalyzeURL(ctx, "https://example.com/architecture-diagram.png", "Analyze this system architecture. Identify components, data flows, and potential bottlenecks.", ) if err != nil { log.Printf("URL analysis error: %v", err) } else { fmt.Println(result.Description) }
// Example 2: Analyze a local file fmt.Println("\n=== Screenshot Analysis ===") result, err = analyzer.AnalyzeFile(ctx, "./screenshots/dashboard.png", "Review this dashboard UI. Check for usability issues and suggest improvements.", ) if err != nil { log.Printf("File analysis error: %v", err) } else { fmt.Println(result.Description) for _, issue := range result.Issues { fmt.Printf("- [%s] %s\n", issue.Severity, issue.Description) } }
// Example 3: Compare multiple images fmt.Println("\n=== Design Comparison ===") result, err = analyzer.CompareImages(ctx, []string{"./mockups/v1.png", "./mockups/v2.png"}, "Compare these two design versions. What changed? Which is better for usability?", ) if err != nil { log.Printf("Comparison error: %v", err) } else { fmt.Println(result.Description) }
// Example 4: Extract structured data fmt.Println("\n=== Data Extraction ===") data, err := analyzer.ExtractData(ctx, "./documents/invoice.png", InvoiceSchema, ) if err != nil { log.Printf("Extraction error: %v", err) } else { fmt.Printf("Invoice Data: %+v\n", data) }}
// NewImageAnalyzer creates an analyzer with the configured providerfunc NewImageAnalyzer() (*ImageAnalyzer, error) { provider, err := openai.NewFromKeystore() if err != nil { provider, err = openai.NewFromEnv() if err != nil { return nil, fmt.Errorf("openai not configured: %w", err) } }
client := core.NewClient(provider, core.WithRetryPolicy(&core.RetryPolicy{ MaxRetries: 3, InitialInterval: 1 * time.Second, MaxInterval: 30 * time.Second, BackoffMultiplier: 2.0, RetryOn: []int{429, 500, 503}, }), )
return &ImageAnalyzer{ client: client, model: "gpt-4o", }, nil}
// AnalyzeURL analyzes an image from a URLfunc (a *ImageAnalyzer) AnalyzeURL(ctx context.Context, imageURL, prompt string) (*AnalysisResult, error) { resp, err := a.client.Chat(a.model). System("You are an expert visual analyst. Provide detailed, actionable analysis."). UserMultimodal(). Text(prompt). ImageURL(imageURL). Done(). Temperature(0.3). GetResponse(ctx)
if err != nil { return nil, fmt.Errorf("analysis failed: %w", err) }
return &AnalysisResult{ Description: resp.Output, }, nil}
// AnalyzeFile analyzes an image from a local filefunc (a *ImageAnalyzer) AnalyzeFile(ctx context.Context, filePath, prompt string) (*AnalysisResult, error) { // Read and encode the file data, err := os.ReadFile(filePath) if err != nil { return nil, fmt.Errorf("reading file: %w", err) }
base64Data := base64.StdEncoding.EncodeToString(data) mimeType := getMimeType(filePath)
resp, err := a.client.Chat(a.model). System(`You are an expert visual analyst. When analyzing images:1. Describe what you see in detail2. Identify any issues or problems3. Provide specific, actionable suggestionsFormat issues as: [SEVERITY] Description - Suggestion`). UserMultimodal(). Text(prompt). ImageBase64(base64Data, mimeType). Done(). Temperature(0.3). GetResponse(ctx)
if err != nil { return nil, fmt.Errorf("analysis failed: %w", err) }
// Parse issues from the response issues := parseIssues(resp.Output)
return &AnalysisResult{ Description: resp.Output, Issues: issues, }, nil}
// CompareImages analyzes multiple images togetherfunc (a *ImageAnalyzer) CompareImages(ctx context.Context, filePaths []string, prompt string) (*AnalysisResult, error) { builder := a.client.Chat(a.model). System("You are an expert at comparing visual designs. Analyze differences and provide recommendations."). UserMultimodal(). Text(prompt)
// Add each image for i, path := range filePaths { data, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("reading file %d: %w", i, err) }
base64Data := base64.StdEncoding.EncodeToString(data) mimeType := getMimeType(path) builder = builder.ImageBase64(base64Data, mimeType) }
resp, err := builder.Done(). Temperature(0.3). GetResponse(ctx)
if err != nil { return nil, fmt.Errorf("comparison failed: %w", err) }
return &AnalysisResult{ Description: resp.Output, }, nil}
// ExtractionSchema is the schema for structured data extraction: the list of
// fields that ExtractData asks the model to return.
type ExtractionSchema struct {
	// Fields are listed in the extraction prompt in this order.
	Fields []SchemaField `json:"fields"`
}
// SchemaField describes one field to extract. ExtractData renders each field
// into the prompt as "- Name (Type): Description".
type SchemaField struct {
	// Name is the field name requested in the returned JSON object.
	Name string `json:"name"`
	// Type is a type label shown to the model (e.g. "string", "number",
	// "array"); it is only interpolated into the prompt text.
	Type string `json:"type"`
	// Description tells the model what the field should contain.
	Description string `json:"description"`
}
// InvoiceSchema lists the fields extracted from an invoice image. main passes
// it to ExtractData.
var InvoiceSchema = ExtractionSchema{
	Fields: []SchemaField{
		{Name: "vendor", Type: "string", Description: "Company or person who issued the invoice"},
		{Name: "invoice_number", Type: "string", Description: "Invoice number or ID"},
		{Name: "date", Type: "string", Description: "Invoice date in ISO format"},
		{Name: "total", Type: "number", Description: "Total amount"},
		{Name: "currency", Type: "string", Description: "Currency code (USD, EUR, etc.)"},
		{Name: "items", Type: "array", Description: "Line items with description and amount"},
	},
}
// ExtractData extracts structured data from an imagefunc (a *ImageAnalyzer) ExtractData(ctx context.Context, filePath string, schema ExtractionSchema) (map[string]interface{}, error) { data, err := os.ReadFile(filePath) if err != nil { return nil, fmt.Errorf("reading file: %w", err) }
base64Data := base64.StdEncoding.EncodeToString(data) mimeType := getMimeType(filePath)
// Build extraction prompt var fieldDescs []string for _, f := range schema.Fields { fieldDescs = append(fieldDescs, fmt.Sprintf("- %s (%s): %s", f.Name, f.Type, f.Description)) }
prompt := fmt.Sprintf(`Extract the following information from this document:%s
Return the data as a JSON object with these exact field names.If a field cannot be determined, use null.`, strings.Join(fieldDescs, "\n"))
resp, err := a.client.Chat(a.model). System("You are a data extraction specialist. Extract structured data from documents accurately. Always respond with valid JSON."). UserMultimodal(). Text(prompt). ImageBase64(base64Data, mimeType). Done(). Temperature(0.0). JSONMode(true). GetResponse(ctx)
if err != nil { return nil, fmt.Errorf("extraction failed: %w", err) }
// Parse JSON response var result map[string]interface{} if err := json.Unmarshal([]byte(resp.Output), &result); err != nil { return nil, fmt.Errorf("parsing response: %w", err) }
return result, nil}
// Helper functions
// getMimeType maps a file path's extension, case-insensitively, to an image
// MIME type. JPEG, GIF and WebP extensions are recognized explicitly; ".png"
// and every other or missing extension yield "image/png".
func getMimeType(path string) string {
	switch ext := strings.ToLower(filepath.Ext(path)); ext {
	case ".jpeg", ".jpg":
		return "image/jpeg"
	case ".gif":
		return "image/gif"
	case ".webp":
		return "image/webp"
	}
	// ".png" and anything unrecognized.
	return "image/png"
}
func parseIssues(text string) []Issue { var issues []Issue lines := strings.Split(text, "\n")
for _, line := range lines { line = strings.TrimSpace(line) if strings.HasPrefix(line, "[") { // Parse [SEVERITY] format if idx := strings.Index(line, "]"); idx > 0 { severity := strings.ToLower(strings.Trim(line[1:idx], " ")) description := strings.TrimSpace(line[idx+1:])
var suggestion string if dashIdx := strings.Index(description, " - "); dashIdx > 0 { suggestion = strings.TrimSpace(description[dashIdx+3:]) description = strings.TrimSpace(description[:dashIdx]) }
issues = append(issues, Issue{ Severity: severity, Description: description, Suggestion: suggestion, }) } } }
return issues}// Using Claude for vision analysisimport "github.com/petal-labs/iris/providers/anthropic"
func NewClaudeAnalyzer() (*ImageAnalyzer, error) { provider, err := anthropic.NewFromKeystore() if err != nil { return nil, err }
client := core.NewClient(provider)
return &ImageAnalyzer{ client: client, model: "claude-sonnet-4-20250514", }, nil}
// Claude supports PDFs natively via the Files APIfunc (a *ImageAnalyzer) AnalyzePDF(ctx context.Context, pdfPath, prompt string) (*AnalysisResult, error) { data, err := os.ReadFile(pdfPath) if err != nil { return nil, err }
base64Data := base64.StdEncoding.EncodeToString(data)
resp, err := a.client.Chat(a.model). System("You are a document analysis expert."). UserMultimodal(). Text(prompt). File(base64Data, "application/pdf"). Done(). GetResponse(ctx)
if err != nil { return nil, err }
return &AnalysisResult{ Description: resp.Output, }, nil}// Using Gemini for vision with Google Search groundingimport "github.com/petal-labs/iris/providers/gemini"
func NewGeminiAnalyzer() (*ImageAnalyzer, error) { provider, err := gemini.NewFromKeystore() if err != nil { return nil, err }
client := core.NewClient(provider)
return &ImageAnalyzer{ client: client, model: "gemini-2.0-flash", }, nil}
// Gemini supports video analysisfunc (a *ImageAnalyzer) AnalyzeVideo(ctx context.Context, videoPath, prompt string) (*AnalysisResult, error) { data, err := os.ReadFile(videoPath) if err != nil { return nil, err }
base64Data := base64.StdEncoding.EncodeToString(data)
resp, err := a.client.Chat(a.model). System("You are a video analysis expert."). UserMultimodal(). Text(prompt). Video(base64Data, "video/mp4"). Done(). GetResponse(ctx)
if err != nil { return nil, err }
return &AnalysisResult{ Description: resp.Output, }, nil}Iris supports multiple ways to include images:
// From URL: the image is referenced by address.
builder.UserMultimodal().
	Text("Describe this image").
	ImageURL("https://example.com/image.png").
	Done()

// From base64-encoded data: the image bytes are embedded together with their
// MIME type.
builder.UserMultimodal().
	Text("Describe this image").
	ImageBase64(base64Data, "image/png").
	Done()

// Multiple images: chain one image call per image in the same message.
builder.UserMultimodal().
	Text("Compare these images").
	ImageURL("https://example.com/before.png").
	ImageURL("https://example.com/after.png").
	Done()
Different providers offer unique capabilities:
| Provider | Feature | Method |
|---|---|---|
| OpenAI | High/low detail mode | ImageURL(url, core.ImageDetailHigh) |
| Claude | PDF processing | File(data, "application/pdf") |
| Gemini | Video analysis | Video(data, "video/mp4") |
| Gemini | Audio analysis | Audio(data, "audio/mp3") |
Use JSON mode for reliable data extraction:
// Request a JSON-only response for the attached image.
resp, err := client.Chat("gpt-4o").
	System("Extract data as JSON.").
	UserMultimodal().
	Text("Extract the invoice details.").
	ImageBase64(base64Data, "image/png").
	Done().
	JSONMode(true).
	GetResponse(ctx)
if err != nil {
	// resp must not be read when the request failed.
	log.Fatal(err)
}

// Decode the response text into a generic map; a decode failure is reported
// rather than leaving data silently empty.
var data map[string]interface{}
if err := json.Unmarshal([]byte(resp.Output), &data); err != nil {
	log.Fatalf("parsing response: %v", err)
}
Optimize images before sending:
import (
	"bytes"
	"image"
	"image/jpeg"
	_ "image/png" // registers the PNG decoder used by image.Decode
)
func preprocessImage(path string, maxSize int) ([]byte, error) { file, err := os.Open(path) if err != nil { return nil, err } defer file.Close()
img, _, err := image.Decode(file) if err != nil { return nil, err }
// Resize if needed bounds := img.Bounds() if bounds.Dx() > maxSize || bounds.Dy() > maxSize { img = resize(img, maxSize) }
// Encode as JPEG for smaller size var buf bytes.Buffer if err := jpeg.Encode(&buf, img, &jpeg.Options{Quality: 85}); err != nil { return nil, err }
return buf.Bytes(), nil}Process multiple images efficiently:
func (a *ImageAnalyzer) BatchAnalyze(ctx context.Context, paths []string, prompt string) ([]AnalysisResult, error) { results := make([]AnalysisResult, len(paths)) errors := make([]error, len(paths))
var wg sync.WaitGroup sem := make(chan struct{}, 5) // Limit concurrency
for i, path := range paths { wg.Add(1) go func(idx int, p string) { defer wg.Done() sem <- struct{}{} defer func() { <-sem }()
result, err := a.AnalyzeFile(ctx, p, prompt) if err != nil { errors[idx] = err return } results[idx] = *result }(i, path) }
wg.Wait()
// Check for errors for _, err := range errors { if err != nil { return results, fmt.Errorf("batch had errors: %w", err) } }
return results, nil}Cache analysis to avoid redundant API calls:
// AnalysisCache stores analysis results in memory so that repeated requests
// for the same file and prompt can skip the API call. Because it contains a
// mutex it must be shared by pointer and never copied.
type AnalysisCache struct {
	// cache maps a "<file hash>:<prompt hash>" key to its stored result.
	cache map[string]*AnalysisResult
	// mu guards cache.
	mu sync.RWMutex
}
func (c *AnalysisCache) GetOrAnalyze(ctx context.Context, analyzer *ImageAnalyzer, path, prompt string) (*AnalysisResult, error) { // Create cache key from file hash + prompt key := hashFile(path) + ":" + hashString(prompt)
c.mu.RLock() if result, ok := c.cache[key]; ok { c.mu.RUnlock() return result, nil } c.mu.RUnlock()
result, err := analyzer.AnalyzeFile(ctx, path, prompt) if err != nil { return nil, err }
c.mu.Lock() c.cache[key] = result c.mu.Unlock()
return result, nil}Handle image-specific errors:
func (a *ImageAnalyzer) AnalyzeWithRetry(ctx context.Context, path, prompt string) (*AnalysisResult, error) { var lastErr error
for attempt := 0; attempt < 3; attempt++ { result, err := a.AnalyzeFile(ctx, path, prompt) if err == nil { return result, nil }
lastErr = err
// Check for specific errors var apiErr *core.APIError if errors.As(err, &apiErr) { switch apiErr.StatusCode { case 400: // Bad request - likely invalid image return nil, fmt.Errorf("invalid image: %w", err) case 413: // Image too large return nil, fmt.Errorf("image too large (max 20MB): %w", err) case 429: // Rate limited - wait and retry time.Sleep(time.Duration(attempt+1) * 5 * time.Second) continue } }
// Unknown error - retry with backoff time.Sleep(time.Duration(attempt+1) * time.Second) }
return nil, fmt.Errorf("analysis failed after retries: %w", lastErr)}result, err := analyzer.AnalyzeURL(ctx, "https://example.com/architecture.png", `Analyze this system architecture diagram:1. Identify all components and their roles2. Trace the data flow from user to database3. Identify single points of failure4. Suggest improvements for scalabilityFormat as: Component Analysis, Data Flow, Risks, Recommendations`,)result, err := analyzer.AnalyzeFile(ctx, "./screenshots/login-page.png", `Perform an accessibility audit:1. Check color contrast ratios2. Identify missing labels or alt text indicators3. Evaluate touch target sizes4. Check for clear visual hierarchyFormat issues as: [WCAG-LEVEL] Issue - Recommendation`,)schema := ExtractionSchema{ Fields: []SchemaField{ {Name: "store_name", Type: "string", Description: "Name of the store"}, {Name: "date", Type: "string", Description: "Transaction date"}, {Name: "items", Type: "array", Description: "List of items with name, qty, price"}, {Name: "subtotal", Type: "number", Description: "Subtotal before tax"}, {Name: "tax", Type: "number", Description: "Tax amount"}, {Name: "total", Type: "number", Description: "Total amount"}, {Name: "payment_method", Type: "string", Description: "How payment was made"}, },}
data, err := analyzer.ExtractData(ctx, "./receipts/grocery.jpg", schema)

=== Architecture Diagram Analysis ===
The diagram shows a three-tier web application with the following components:

**Components:**
- Load Balancer (nginx) - Entry point, distributes traffic
- API Servers (3x) - Stateless application logic
- Redis Cache - Session storage and caching
- PostgreSQL Primary - Main database
- PostgreSQL Replica - Read replica

**Data Flow:**
User → Load Balancer → API Server → Cache/Database

**Potential Bottlenecks:**
1. Single Redis instance - no failover
2. Database writes concentrated on primary
3. No CDN for static assets

**Recommendations:**
- Add Redis Sentinel for cache HA
- Consider read replicas for heavy read operations
- Implement connection pooling (PgBouncer)

=== Screenshot Analysis ===
Dashboard UI review completed.

- [HIGH] Low contrast text - White text on light gray background fails WCAG AA. Use #333 on #fff.
- [MEDIUM] Small touch targets - Filter buttons are 32px. Increase to 44px minimum.
- [LOW] Missing loading states - Charts show no skeleton loaders.

| Practice | Recommendation |
|---|---|
| Image size | Resize to max 2048px for faster processing |
| Format | Use JPEG for photos, PNG for diagrams |
| Prompts | Be specific about what to analyze |
| Temperature | Use 0.0-0.3 for factual extraction |
| JSON mode | Enable for structured data extraction |
| Batching | Limit concurrent requests to 5-10 |
OpenAI Vision
GPT-4o vision capabilities. OpenAI →
Anthropic Files
Claude PDF processing. Anthropic →