Skip to content

Testing Utilities

Iris provides testing utilities that make it easy to write deterministic, fast, and reliable tests for code that uses LLM providers. The testing package includes MockProvider for controlled responses and RecordingProvider for capturing real interactions.

Key benefits of testing with these utilities:

  • Deterministic tests: Same input always produces same output
  • Fast execution: No network latency or API rate limits
  • Cost-free: No API charges during test runs
  • Offline capability: Tests run without internet connection
  • Edge case testing: Simulate errors, timeouts, and unusual responses

MockProvider returns predefined responses, allowing you to control exactly what your code receives from the LLM layer.

package myapp_test

import (
	"context"
	"testing"

	"github.com/petal-labs/iris/core"
	// Alias the iris testing package: importing it unaliased would
	// redeclare the standard library "testing" package and fail to
	// compile.
	iristesting "github.com/petal-labs/iris/testing"
)

// TestChatHandler verifies that code under test receives exactly the
// response configured on the mock provider.
func TestChatHandler(t *testing.T) {
	// Create mock with a single response.
	mock := iristesting.NewMockProvider().
		WithResponse(core.ChatResponse{
			ID:     "test-response",
			Model:  "mock-model",
			Output: "Hello! I'm a mock response.",
			Usage:  core.TokenUsage{TotalTokens: 10},
		})

	// Create client with mock provider, exactly as production code
	// would wire in a real provider.
	client := core.NewClient(mock)

	// Your code under test.
	resp, err := client.Chat("any-model").
		User("Hello!").
		GetResponse(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if resp.Output != "Hello! I'm a mock response." {
		t.Errorf("unexpected output: %s", resp.Output)
	}
}

Queue multiple responses for multi-turn conversations or sequential tests:

// TestMultiTurnConversation shows queued responses being consumed in
// order, with the default response used once the queue is exhausted.
// Unlike a quick example, each error is checked so a failing call is
// reported as its real cause rather than as a wrong-output mismatch.
func TestMultiTurnConversation(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{
			ID:     "resp-1",
			Output: "I'm the first response.",
		}).
		WithResponse(core.ChatResponse{
			ID:     "resp-2",
			Output: "I'm the second response.",
		}).
		WithDefaultResponse(core.ChatResponse{
			ID:     "default",
			Output: "I'm the default response.",
		})
	client := core.NewClient(mock)
	ctx := context.Background()

	// First call gets first queued response.
	resp1, err := client.Chat("model").User("First").GetResponse(ctx)
	if err != nil {
		t.Fatalf("first call: %v", err)
	}
	if resp1.Output != "I'm the first response." {
		t.Errorf("wrong first response: %s", resp1.Output)
	}

	// Second call gets second queued response.
	resp2, err := client.Chat("model").User("Second").GetResponse(ctx)
	if err != nil {
		t.Fatalf("second call: %v", err)
	}
	if resp2.Output != "I'm the second response." {
		t.Errorf("wrong second response: %s", resp2.Output)
	}

	// Third call (queue exhausted) gets default response.
	resp3, err := client.Chat("model").User("Third").GetResponse(ctx)
	if err != nil {
		t.Fatalf("third call: %v", err)
	}
	if resp3.Output != "I'm the default response." {
		t.Errorf("wrong default response: %s", resp3.Output)
	}
}

Test error handling by injecting specific errors:

// TestRateLimitHandling injects core.ErrRateLimited from the mock and
// asserts that callers can detect it with errors.Is.
func TestRateLimitHandling(t *testing.T) {
	mock := testing.NewMockProvider().
		WithError(core.ErrRateLimited)
	client := core.NewClient(mock)

	// Every call now fails with the injected error.
	_, err := client.Chat("model").
		User("This will fail").
		GetResponse(context.Background())
	if !errors.Is(err, core.ErrRateLimited) {
		t.Errorf("expected rate limit error, got: %v", err)
	}
}
// TestAuthenticationError injects core.ErrUnauthorized and asserts the
// sentinel survives the client layer, matchable via errors.Is.
func TestAuthenticationError(t *testing.T) {
	mock := testing.NewMockProvider().
		WithError(core.ErrUnauthorized)
	client := core.NewClient(mock)

	_, err := client.Chat("model").
		User("This will fail").
		GetResponse(context.Background())
	if !errors.Is(err, core.ErrUnauthorized) {
		t.Errorf("expected auth error, got: %v", err)
	}
}

Mock streaming responses with controlled chunks:

// TestStreamingHandler drives a mocked stream, collects every delta
// chunk, and compares the result against the configured chunk list.
func TestStreamingHandler(t *testing.T) {
	mock := testing.NewMockProvider().
		WithStreamingResponse(
			[]string{"Hello", " ", "world", "!"},
			&core.ChatResponse{
				ID:     "stream-resp",
				Model:  "mock-model",
				Output: "Hello world!",
				Usage:  core.TokenUsage{TotalTokens: 5},
			},
		)
	client := core.NewClient(mock)

	stream, err := client.Chat("model").
		User("Stream something").
		Stream(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Drain the chunk channel, then read the error channel once the
	// stream has finished.
	var got []string
	for chunk := range stream.Ch {
		got = append(got, chunk.Delta)
	}
	if err := <-stream.Err; err != nil {
		t.Fatalf("stream error: %v", err)
	}

	want := []string{"Hello", " ", "world", "!"}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("chunks = %v, want %v", got, want)
	}
}

Verify that your code makes the expected requests:

// TestRequestInspection asserts on the requests the code under test
// sent, using the call log the mock records for every invocation.
func TestRequestInspection(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{Output: "OK"})
	client := core.NewClient(mock)
	ctx := context.Background()

	// Exercise the client a couple of times.
	client.Chat("gpt-4o").
		System("You are helpful.").
		User("Hello").
		Temperature(0.7).
		GetResponse(ctx)
	client.Chat("gpt-4o").
		User("Goodbye").
		GetResponse(ctx)

	// The mock recorded both calls in order.
	calls := mock.Calls()
	if len(calls) != 2 {
		t.Fatalf("expected 2 calls, got %d", len(calls))
	}

	// Inspect the first request in detail.
	first := calls[0]
	if first.Request.Model != "gpt-4o" {
		t.Errorf("wrong model: %s", first.Request.Model)
	}
	if len(first.Request.Messages) != 2 {
		t.Errorf("expected 2 messages, got %d", len(first.Request.Messages))
	}
	// The System(...) call should have produced a leading system message.
	if first.Request.Messages[0].Role != core.RoleSystem {
		t.Error("first message should be system")
	}
}

RecordingProvider wraps a real provider and records all interactions. This is useful for:

  • Debugging production issues
  • Creating test fixtures from real responses
  • Analyzing actual API usage patterns

The following program records two real API calls and prints what was captured:
package main

import (
	"context"
	"fmt"
	"os"

	"github.com/petal-labs/iris/core"
	"github.com/petal-labs/iris/providers/openai"
	"github.com/petal-labs/iris/testing"
)

// truncate shortens s to at most n runes, appending an ellipsis when
// anything was cut. Defined here so the example compiles as shown.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	r := []rune(s)
	if len(r) <= n {
		return s
	}
	return string(r[:n]) + "..."
}

func main() {
	// Wrap the real provider with a recorder: every call passes
	// through to OpenAI and is captured for later inspection.
	realProvider := openai.New(os.Getenv("OPENAI_API_KEY"))
	recorder := testing.NewRecordingProvider(realProvider)
	client := core.NewClient(recorder)
	ctx := context.Background()

	// Make API calls as normal.
	_, _ = client.Chat("gpt-4o-mini").
		User("What is the capital of France?").
		GetResponse(ctx)
	_, _ = client.Chat("gpt-4o-mini").
		User("And what about Germany?").
		GetResponse(ctx)

	// Inspect recorded interactions.
	for i, rec := range recorder.Recordings() {
		fmt.Printf("Call %d:\n", i+1)
		fmt.Printf(" Method: %s\n", rec.Method)
		fmt.Printf(" Duration: %v\n", rec.Duration)
		fmt.Printf(" Model: %s\n", rec.Request.Model)
		if rec.Response != nil {
			fmt.Printf(" Output: %s\n", truncate(rec.Response.Output, 50))
			fmt.Printf(" Tokens: %d\n", rec.Response.Usage.TotalTokens)
		}
		if rec.Error != nil {
			fmt.Printf(" Error: %v\n", rec.Error)
		}
	}
}

Use recordings to verify API interactions in integration tests:

// TestAPIIntegration exercises the real OpenAI provider (skipped when
// no API key is configured) and verifies the calls via the recorder.
func TestAPIIntegration(t *testing.T) {
	if os.Getenv("OPENAI_API_KEY") == "" {
		t.Skip("OPENAI_API_KEY not set")
	}
	provider := openai.New(os.Getenv("OPENAI_API_KEY"))
	recorder := testing.NewRecordingProvider(provider)
	client := core.NewClient(recorder)

	// Run your code. The blank assignment keeps the example
	// compilable: an unused variable is a compile error in Go.
	result := myApp.ProcessQuery(client, "test query")
	_ = result // app-specific assertions on result go here

	// Verify recordings.
	recordings := recorder.Recordings()
	if len(recordings) == 0 {
		t.Error("expected at least one API call")
	}
	// Every call must use the right model and must not have errored.
	for i, rec := range recordings {
		if rec.Request.Model != "gpt-4o-mini" {
			t.Errorf("expected gpt-4o-mini, got %s", rec.Request.Model)
		}
		if rec.Error != nil {
			t.Errorf("call %d failed: %v", i, rec.Error)
		}
	}
}

Clear recordings between test cases:

// TestMultipleScenarios resets the recorder between subtests so each
// scenario starts with an empty recording list.
func TestMultipleScenarios(t *testing.T) {
	provider := testing.NewMockProvider().
		WithDefaultResponse(core.ChatResponse{Output: "OK"})
	recorder := testing.NewRecordingProvider(provider)
	client := core.NewClient(recorder)

	t.Run("scenario1", func(t *testing.T) {
		recorder.Clear() // Start fresh
		// Scenario 1: a single call leaves one recording.
		client.Chat("model").User("Test 1").GetResponse(context.Background())
		if len(recorder.Recordings()) != 1 {
			t.Error("expected 1 recording")
		}
	})

	t.Run("scenario2", func(t *testing.T) {
		recorder.Clear() // Start fresh
		// Scenario 2: two calls leave two recordings.
		client.Chat("model").User("Test 2a").GetResponse(context.Background())
		client.Chat("model").User("Test 2b").GetResponse(context.Background())
		if len(recorder.Recordings()) != 2 {
			t.Error("expected 2 recordings")
		}
	})
}
// TestPromptVariations runs a table of prompt/response pairs, building
// a fresh mock per case so no state leaks between subtests.
func TestPromptVariations(t *testing.T) {
	cases := []struct {
		name     string
		prompt   string
		response string
		wantErr  bool
	}{
		{name: "greeting", prompt: "Hello", response: "Hi there!"},
		{name: "farewell", prompt: "Goodbye", response: "See you later!"},
		{name: "error case", prompt: "trigger error", wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Error cases get an error-injecting mock; happy paths
			// get a mock preloaded with the expected response.
			var mock *testing.MockProvider
			if tc.wantErr {
				mock = testing.NewMockProvider().
					WithError(errors.New("simulated error"))
			} else {
				mock = testing.NewMockProvider().
					WithResponse(core.ChatResponse{Output: tc.response})
			}
			client := core.NewClient(mock)

			resp, err := client.Chat("model").
				User(tc.prompt).
				GetResponse(context.Background())
			if tc.wantErr {
				if err == nil {
					t.Error("expected error, got nil")
				}
				return
			}
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if resp.Output != tc.response {
				t.Errorf("output = %q, want %q", resp.Output, tc.response)
			}
		})
	}
}

Structure your code for testability:

// production code

// ChatService wraps an iris client behind an application-level API.
// Accepting core.Provider in the constructor is what makes the
// service testable with MockProvider.
type ChatService struct {
	client *core.Client
}

// NewChatService builds a ChatService on top of any core.Provider.
func NewChatService(provider core.Provider) *ChatService {
	return &ChatService{client: core.NewClient(provider)}
}
// Summarize asks the model for a one-sentence summary of text and
// returns the raw model output, or the underlying call error.
func (s *ChatService) Summarize(ctx context.Context, text string) (string, error) {
	resp, err := s.client.Chat("gpt-4o-mini").
		System("Summarize the following text in one sentence.").
		User(text).
		GetResponse(ctx)
	if err != nil {
		return "", err
	}
	return resp.Output, nil
}
// test code

// TestSummarize drives ChatService with a mock provider and verifies
// both the returned summary and the request that was actually sent.
func TestSummarize(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{
			Output: "This is a summary.",
		})
	service := NewChatService(mock)

	result, err := service.Summarize(context.Background(), "Long text...")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result != "This is a summary." {
		t.Errorf("unexpected result: %s", result)
	}

	// Verify the request. Guard the indexing so a missing call fails
	// the test cleanly instead of panicking the test binary.
	calls := mock.Calls()
	if len(calls) == 0 || len(calls[0].Request.Messages) == 0 {
		t.Fatal("expected a recorded call with at least one message")
	}
	if calls[0].Request.Messages[0].Content != "Summarize the following text in one sentence." {
		t.Error("wrong system prompt")
	}
}
// TestToolCallingFlow queues a tool-call response followed by a final
// answer, mirroring the two round-trips of a real tool-calling loop.
// Errors are checked before each response is used: the original
// discarded them and would have panicked on a nil response.
func TestToolCallingFlow(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{
			ID:     "resp-1",
			Output: "",
			ToolCalls: []core.ToolCall{
				{
					ID:        "call-1",
					Name:      "get_weather",
					Arguments: json.RawMessage(`{"location": "Tokyo"}`),
				},
			},
		}).
		WithResponse(core.ChatResponse{
			ID:     "resp-2",
			Output: "The weather in Tokyo is sunny and 22°C.",
		})
	client := core.NewClient(mock)
	ctx := context.Background()

	// First call - should get a tool call back.
	resp1, err := client.Chat("gpt-4o").
		User("What's the weather in Tokyo?").
		Tools(weatherTool).
		GetResponse(ctx)
	if err != nil {
		t.Fatalf("first call: %v", err)
	}
	if len(resp1.ToolCalls) != 1 {
		t.Fatalf("expected 1 tool call, got %d", len(resp1.ToolCalls))
	}

	// Second call - supply the tool result and expect a final answer.
	resp2, err := client.Chat("gpt-4o").
		User("What's the weather in Tokyo?").
		Tools(weatherTool).
		ToolResults(core.ToolResult{
			CallID:  "call-1",
			Content: `{"temperature": 22, "condition": "sunny"}`,
		}).
		GetResponse(ctx)
	if err != nil {
		t.Fatalf("second call: %v", err)
	}
	if resp2.Output == "" {
		t.Error("expected final response with output")
	}
}
// Unit test - fast, deterministic
func TestBusinessLogic(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{Output: "expected"})
	// Blank assignment keeps the skeleton compilable: an unused
	// variable is a compile error in Go. Replace with real usage.
	_ = mock

	// Test your logic
}
// Integration test - verifies real API behavior
func TestIntegration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test")
	}
	// NOTE(review): in a real file, import the iris testing package
	// under an alias (e.g. iristesting) so it does not collide with
	// the standard library "testing" used by testing.Short above.
	provider := openai.New(os.Getenv("OPENAI_API_KEY"))
	recorder := testing.NewRecordingProvider(provider)
	// Blank assignment keeps the skeleton compilable until the real
	// test body uses the recorder.
	_ = recorder

	// Test with real API
}
// TestRequestConstruction runs myFunction against a mock and asserts
// on the request it built. Nil and length guards keep a broken request
// from panicking the test binary (the original dereferenced
// req.Temperature unconditionally).
func TestRequestConstruction(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{Output: "OK"})
	client := core.NewClient(mock)

	myFunction(client)

	calls := mock.Calls()
	if len(calls) == 0 {
		t.Fatal("expected at least one recorded call")
	}
	req := calls[0].Request

	// Verify request was constructed correctly.
	assert.Equal(t, "gpt-4o", req.Model)
	if assert.NotNil(t, req.Temperature) {
		assert.Equal(t, 0.7, *req.Temperature)
	}
	assert.Len(t, req.Messages, 2)
}
// TestErrorRecovery queues an error followed by a success, so retry
// logic can be exercised deterministically: the first call fails with
// ErrRateLimited and the retried call succeeds.
func TestErrorRecovery(t *testing.T) {
	mock := testing.NewMockProvider().
		WithError(core.ErrRateLimited).
		WithResponse(core.ChatResponse{Output: "Success"})

	// Your retry logic should absorb the first failure.
	result := myAppWithRetry(core.NewClient(mock))
	if result != "Success" {
		t.Error("retry should have succeeded")
	}
}

Tools Guide

Test tool calling flows. Tools →

API Reference

Full testing package API. API →