Skip to content

Testing Utilities

Iris provides testing utilities that make it easy to write deterministic, fast, and reliable tests for code that uses LLM providers. The testing package includes MockProvider for controlled responses and RecordingProvider for capturing real interactions.

Key benefits of testing with these utilities:

  • Deterministic tests: Same input always produces same output
  • Fast execution: No network latency or API rate limits
  • Cost-free: No API charges during test runs
  • Offline capability: Tests run without internet connection
  • Edge case testing: Simulate errors, timeouts, and unusual responses

MockProvider returns predefined responses, allowing you to control exactly what your code receives from the LLM layer.

package myapp_test

import (
	"context"
	"testing"

	"github.com/petal-labs/iris/core"
	// Alias the iris testing package: importing it unaliased would
	// redeclare the standard library "testing" package and fail to
	// compile.
	iristesting "github.com/petal-labs/iris/testing"
)

// TestChatHandler verifies that code under test receives exactly the
// response configured on the mock provider.
func TestChatHandler(t *testing.T) {
	// Create mock with a single response.
	mock := iristesting.NewMockProvider().
		WithResponse(core.ChatResponse{
			ID:     "test-response",
			Model:  "mock-model",
			Output: "Hello! I'm a mock response.",
			Usage:  core.TokenUsage{TotalTokens: 10},
		})

	// Create client with mock provider, exactly as production code
	// would wire in a real provider.
	client := core.NewClient(mock)

	// Your code under test.
	resp, err := client.Chat("any-model").
		User("Hello!").
		GetResponse(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if resp.Output != "Hello! I'm a mock response." {
		t.Errorf("unexpected output: %s", resp.Output)
	}
}

Queue multiple responses for multi-turn conversations or sequential tests:

// TestMultiTurnConversation shows queued responses being consumed in
// order, with the default response used once the queue is exhausted.
// Unlike a quick example, each error is checked so a failing call is
// reported as its real cause rather than as a wrong-output mismatch.
func TestMultiTurnConversation(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{
			ID:     "resp-1",
			Output: "I'm the first response.",
		}).
		WithResponse(core.ChatResponse{
			ID:     "resp-2",
			Output: "I'm the second response.",
		}).
		WithDefaultResponse(core.ChatResponse{
			ID:     "default",
			Output: "I'm the default response.",
		})
	client := core.NewClient(mock)
	ctx := context.Background()

	// First call gets first queued response.
	resp1, err := client.Chat("model").User("First").GetResponse(ctx)
	if err != nil {
		t.Fatalf("first call: %v", err)
	}
	if resp1.Output != "I'm the first response." {
		t.Errorf("wrong first response: %s", resp1.Output)
	}

	// Second call gets second queued response.
	resp2, err := client.Chat("model").User("Second").GetResponse(ctx)
	if err != nil {
		t.Fatalf("second call: %v", err)
	}
	if resp2.Output != "I'm the second response." {
		t.Errorf("wrong second response: %s", resp2.Output)
	}

	// Third call (queue exhausted) gets default response.
	resp3, err := client.Chat("model").User("Third").GetResponse(ctx)
	if err != nil {
		t.Fatalf("third call: %v", err)
	}
	if resp3.Output != "I'm the default response." {
		t.Errorf("wrong default response: %s", resp3.Output)
	}
}

Test error handling by injecting specific errors:

// TestRateLimitHandling injects core.ErrRateLimited from the mock and
// asserts that callers can detect it with errors.Is.
func TestRateLimitHandling(t *testing.T) {
	mock := testing.NewMockProvider().
		WithError(core.ErrRateLimited)
	client := core.NewClient(mock)

	// Every call now fails with the injected error.
	_, err := client.Chat("model").
		User("This will fail").
		GetResponse(context.Background())
	if !errors.Is(err, core.ErrRateLimited) {
		t.Errorf("expected rate limit error, got: %v", err)
	}
}
// TestAuthenticationError injects core.ErrUnauthorized and asserts the
// sentinel survives the client layer, matchable via errors.Is.
func TestAuthenticationError(t *testing.T) {
	mock := testing.NewMockProvider().
		WithError(core.ErrUnauthorized)
	client := core.NewClient(mock)

	_, err := client.Chat("model").
		User("This will fail").
		GetResponse(context.Background())
	if !errors.Is(err, core.ErrUnauthorized) {
		t.Errorf("expected auth error, got: %v", err)
	}
}

Mock streaming responses with controlled chunks:

// TestStreamingHandler drives a mocked stream, collects every delta
// chunk, and compares the result against the configured chunk list.
func TestStreamingHandler(t *testing.T) {
	mock := testing.NewMockProvider().
		WithStreamingResponse(
			[]string{"Hello", " ", "world", "!"},
			&core.ChatResponse{
				ID:     "stream-resp",
				Model:  "mock-model",
				Output: "Hello world!",
				Usage:  core.TokenUsage{TotalTokens: 5},
			},
		)
	client := core.NewClient(mock)

	stream, err := client.Chat("model").
		User("Stream something").
		Stream(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Drain the chunk channel, then read the error channel once the
	// stream has finished.
	var got []string
	for chunk := range stream.Ch {
		got = append(got, chunk.Delta)
	}
	if err := <-stream.Err; err != nil {
		t.Fatalf("stream error: %v", err)
	}

	want := []string{"Hello", " ", "world", "!"}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("chunks = %v, want %v", got, want)
	}
}

Verify that your code makes the expected requests:

// TestRequestInspection asserts on the requests the code under test
// sent, using the call log the mock records for every invocation.
func TestRequestInspection(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{Output: "OK"})
	client := core.NewClient(mock)
	ctx := context.Background()

	// Exercise the client a couple of times.
	client.Chat("gpt-4o").
		System("You are helpful.").
		User("Hello").
		Temperature(0.7).
		GetResponse(ctx)
	client.Chat("gpt-4o").
		User("Goodbye").
		GetResponse(ctx)

	// The mock recorded both calls in order.
	calls := mock.Calls()
	if len(calls) != 2 {
		t.Fatalf("expected 2 calls, got %d", len(calls))
	}

	// Inspect the first request in detail.
	first := calls[0]
	if first.Request.Model != "gpt-4o" {
		t.Errorf("wrong model: %s", first.Request.Model)
	}
	if len(first.Request.Messages) != 2 {
		t.Errorf("expected 2 messages, got %d", len(first.Request.Messages))
	}
	// The System(...) call should have produced a leading system message.
	if first.Request.Messages[0].Role != core.RoleSystem {
		t.Error("first message should be system")
	}
}

RecordingProvider wraps a real provider and records all interactions. This is useful for:

  • Debugging production issues
  • Creating test fixtures from real responses
  • Analyzing actual API usage patterns

The following program records two real API calls and prints what was captured:
package main

import (
	"context"
	"fmt"
	"os"

	"github.com/petal-labs/iris/core"
	"github.com/petal-labs/iris/providers/openai"
	"github.com/petal-labs/iris/testing"
)

// truncate shortens s to at most n runes, appending an ellipsis when
// anything was cut. Defined here so the example compiles as shown.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	r := []rune(s)
	if len(r) <= n {
		return s
	}
	return string(r[:n]) + "..."
}

func main() {
	// Wrap the real provider with a recorder: every call passes
	// through to OpenAI and is captured for later inspection.
	realProvider := openai.New(os.Getenv("OPENAI_API_KEY"))
	recorder := testing.NewRecordingProvider(realProvider)
	client := core.NewClient(recorder)
	ctx := context.Background()

	// Make API calls as normal.
	_, _ = client.Chat("gpt-4o-mini").
		User("What is the capital of France?").
		GetResponse(ctx)
	_, _ = client.Chat("gpt-4o-mini").
		User("And what about Germany?").
		GetResponse(ctx)

	// Inspect recorded interactions.
	for i, rec := range recorder.Recordings() {
		fmt.Printf("Call %d:\n", i+1)
		fmt.Printf(" Method: %s\n", rec.Method)
		fmt.Printf(" Duration: %v\n", rec.Duration)
		fmt.Printf(" Model: %s\n", rec.Request.Model)
		if rec.Response != nil {
			fmt.Printf(" Output: %s\n", truncate(rec.Response.Output, 50))
			fmt.Printf(" Tokens: %d\n", rec.Response.Usage.TotalTokens)
		}
		if rec.Error != nil {
			fmt.Printf(" Error: %v\n", rec.Error)
		}
	}
}

Use recordings to verify API interactions in integration tests:

// TestAPIIntegration exercises the real OpenAI provider (skipped when
// no API key is configured) and verifies the calls via the recorder.
func TestAPIIntegration(t *testing.T) {
	if os.Getenv("OPENAI_API_KEY") == "" {
		t.Skip("OPENAI_API_KEY not set")
	}
	provider := openai.New(os.Getenv("OPENAI_API_KEY"))
	recorder := testing.NewRecordingProvider(provider)
	client := core.NewClient(recorder)

	// Run your code. The blank assignment keeps the example
	// compilable: an unused variable is a compile error in Go.
	result := myApp.ProcessQuery(client, "test query")
	_ = result // app-specific assertions on result go here

	// Verify recordings.
	recordings := recorder.Recordings()
	if len(recordings) == 0 {
		t.Error("expected at least one API call")
	}
	// Every call must use the right model and must not have errored.
	for i, rec := range recordings {
		if rec.Request.Model != "gpt-4o-mini" {
			t.Errorf("expected gpt-4o-mini, got %s", rec.Request.Model)
		}
		if rec.Error != nil {
			t.Errorf("call %d failed: %v", i, rec.Error)
		}
	}
}

Clear recordings between test cases:

// TestMultipleScenarios resets the recorder between subtests so each
// scenario starts with an empty recording list.
func TestMultipleScenarios(t *testing.T) {
	provider := testing.NewMockProvider().
		WithDefaultResponse(core.ChatResponse{Output: "OK"})
	recorder := testing.NewRecordingProvider(provider)
	client := core.NewClient(recorder)

	t.Run("scenario1", func(t *testing.T) {
		recorder.Clear() // Start fresh
		// Scenario 1: a single call leaves one recording.
		client.Chat("model").User("Test 1").GetResponse(context.Background())
		if len(recorder.Recordings()) != 1 {
			t.Error("expected 1 recording")
		}
	})

	t.Run("scenario2", func(t *testing.T) {
		recorder.Clear() // Start fresh
		// Scenario 2: two calls leave two recordings.
		client.Chat("model").User("Test 2a").GetResponse(context.Background())
		client.Chat("model").User("Test 2b").GetResponse(context.Background())
		if len(recorder.Recordings()) != 2 {
			t.Error("expected 2 recordings")
		}
	})
}
// TestPromptVariations runs a table of prompt/response pairs, building
// a fresh mock per case so no state leaks between subtests.
func TestPromptVariations(t *testing.T) {
	cases := []struct {
		name     string
		prompt   string
		response string
		wantErr  bool
	}{
		{name: "greeting", prompt: "Hello", response: "Hi there!"},
		{name: "farewell", prompt: "Goodbye", response: "See you later!"},
		{name: "error case", prompt: "trigger error", wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Error cases get an error-injecting mock; happy paths
			// get a mock preloaded with the expected response.
			var mock *testing.MockProvider
			if tc.wantErr {
				mock = testing.NewMockProvider().
					WithError(errors.New("simulated error"))
			} else {
				mock = testing.NewMockProvider().
					WithResponse(core.ChatResponse{Output: tc.response})
			}
			client := core.NewClient(mock)

			resp, err := client.Chat("model").
				User(tc.prompt).
				GetResponse(context.Background())
			if tc.wantErr {
				if err == nil {
					t.Error("expected error, got nil")
				}
				return
			}
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if resp.Output != tc.response {
				t.Errorf("output = %q, want %q", resp.Output, tc.response)
			}
		})
	}
}

Structure your code for testability:

// production code

// ChatService wraps an iris client behind an application-level API.
// Accepting core.Provider in the constructor is what makes the
// service testable with MockProvider.
type ChatService struct {
	client *core.Client
}

// NewChatService builds a ChatService on top of any core.Provider.
func NewChatService(provider core.Provider) *ChatService {
	return &ChatService{client: core.NewClient(provider)}
}
// Summarize asks the model for a one-sentence summary of text and
// returns the raw model output, or the underlying call error.
func (s *ChatService) Summarize(ctx context.Context, text string) (string, error) {
	resp, err := s.client.Chat("gpt-4o-mini").
		System("Summarize the following text in one sentence.").
		User(text).
		GetResponse(ctx)
	if err != nil {
		return "", err
	}
	return resp.Output, nil
}
// test code

// TestSummarize drives ChatService with a mock provider and verifies
// both the returned summary and the request that was actually sent.
func TestSummarize(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{
			Output: "This is a summary.",
		})
	service := NewChatService(mock)

	result, err := service.Summarize(context.Background(), "Long text...")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result != "This is a summary." {
		t.Errorf("unexpected result: %s", result)
	}

	// Verify the request. Guard the indexing so a missing call fails
	// the test cleanly instead of panicking the test binary.
	calls := mock.Calls()
	if len(calls) == 0 || len(calls[0].Request.Messages) == 0 {
		t.Fatal("expected a recorded call with at least one message")
	}
	if calls[0].Request.Messages[0].Content != "Summarize the following text in one sentence." {
		t.Error("wrong system prompt")
	}
}
// TestToolCallingFlow queues a tool-call response followed by a final
// answer, mirroring the two round-trips of a real tool-calling loop.
// Errors are checked before each response is used: the original
// discarded them and would have panicked on a nil response.
func TestToolCallingFlow(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{
			ID:     "resp-1",
			Output: "",
			ToolCalls: []core.ToolCall{
				{
					ID:        "call-1",
					Name:      "get_weather",
					Arguments: json.RawMessage(`{"location": "Tokyo"}`),
				},
			},
		}).
		WithResponse(core.ChatResponse{
			ID:     "resp-2",
			Output: "The weather in Tokyo is sunny and 22°C.",
		})
	client := core.NewClient(mock)
	ctx := context.Background()

	// First call - should get a tool call back.
	resp1, err := client.Chat("gpt-4o").
		User("What's the weather in Tokyo?").
		Tools(weatherTool).
		GetResponse(ctx)
	if err != nil {
		t.Fatalf("first call: %v", err)
	}
	if len(resp1.ToolCalls) != 1 {
		t.Fatalf("expected 1 tool call, got %d", len(resp1.ToolCalls))
	}

	// Second call - supply the tool result and expect a final answer.
	resp2, err := client.Chat("gpt-4o").
		User("What's the weather in Tokyo?").
		Tools(weatherTool).
		ToolResults(core.ToolResult{
			CallID:  "call-1",
			Content: `{"temperature": 22, "condition": "sunny"}`,
		}).
		GetResponse(ctx)
	if err != nil {
		t.Fatalf("second call: %v", err)
	}
	if resp2.Output == "" {
		t.Error("expected final response with output")
	}
}
// Unit test - fast, deterministic
func TestBusinessLogic(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{Output: "expected"})
	// Blank assignment keeps the skeleton compilable: an unused
	// variable is a compile error in Go. Replace with real usage.
	_ = mock

	// Test your logic
}
// Integration test - verifies real API behavior
func TestIntegration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test")
	}
	// NOTE(review): in a real file, import the iris testing package
	// under an alias (e.g. iristesting) so it does not collide with
	// the standard library "testing" used by testing.Short above.
	provider := openai.New(os.Getenv("OPENAI_API_KEY"))
	recorder := testing.NewRecordingProvider(provider)
	// Blank assignment keeps the skeleton compilable until the real
	// test body uses the recorder.
	_ = recorder

	// Test with real API
}
// TestRequestConstruction runs myFunction against a mock and asserts
// on the request it built. Nil and length guards keep a broken request
// from panicking the test binary (the original dereferenced
// req.Temperature unconditionally).
func TestRequestConstruction(t *testing.T) {
	mock := testing.NewMockProvider().
		WithResponse(core.ChatResponse{Output: "OK"})
	client := core.NewClient(mock)

	myFunction(client)

	calls := mock.Calls()
	if len(calls) == 0 {
		t.Fatal("expected at least one recorded call")
	}
	req := calls[0].Request

	// Verify request was constructed correctly.
	assert.Equal(t, "gpt-4o", req.Model)
	if assert.NotNil(t, req.Temperature) {
		assert.Equal(t, 0.7, *req.Temperature)
	}
	assert.Len(t, req.Messages, 2)
}
// TestErrorRecovery queues an error followed by a success, so retry
// logic can be exercised deterministically: the first call fails with
// ErrRateLimited and the retried call succeeds.
func TestErrorRecovery(t *testing.T) {
	mock := testing.NewMockProvider().
		WithError(core.ErrRateLimited).
		WithResponse(core.ChatResponse{Output: "Success"})

	// Your retry logic should absorb the first failure.
	result := myAppWithRetry(core.NewClient(mock))
	if result != "Success" {
		t.Error("retry should have succeeded")
	}
}

Tools Guide

Test tool calling flows. Tools →

API Reference

Full testing package API. API →