Ollama

The Ollama provider connects Iris to locally running or cloud-hosted Ollama instances. Run models like Llama 3.1, Mistral, and Code Llama on your own hardware for privacy, cost savings, and offline capabilities.

package main

import (
    "context"
    "fmt"

    "github.com/petal-labs/iris/core"
    "github.com/petal-labs/iris/providers/ollama"
)

func main() {
    provider := ollama.NewLocal()
    client := core.NewClient(provider)

    resp, err := client.Chat("llama3.1").
        System("You are a helpful assistant.").
        User("What is the capital of France?").
        GetResponse(context.Background())
    if err != nil {
        panic(err)
    }

    fmt.Println(resp.Output)
}

Before using the Ollama provider, install Ollama on your system:

# Using Homebrew
brew install ollama
# Or download from ollama.com
curl -fsSL https://ollama.com/install.sh | sh

Then start the Ollama service:

# Start Ollama server
ollama serve
# Pull a model
ollama pull llama3.1

# No configuration needed for local instances
# Default: http://localhost:11434
# Optional: override the host
export OLLAMA_HOST=http://192.168.1.100:11434
import "github.com/petal-labs/iris/providers/ollama"
// Local instance (default: http://localhost:11434)
provider := ollama.NewLocal()
// Local instance with custom host via OLLAMA_HOST env var
os.Setenv("OLLAMA_HOST", "http://192.168.1.100:11434")
provider := ollama.NewLocal()
// Cloud instance from OLLAMA_API_KEY
provider, err := ollama.NewCloudFromEnv()
// Manual configuration
provider := ollama.New(
ollama.WithBaseURL("http://my-server:11434"),
ollama.WithCloud(),
ollama.WithAPIKey("..."),
)

| Option | Description | Default |
| --- | --- | --- |
| WithBaseURL(url) | Override the API base URL | http://localhost:11434 |
| WithCloud() | Enable cloud mode (adds auth headers) | Disabled |
| WithAPIKey(key) | Set the API key for cloud mode | None |
| WithHTTPClient(client) | Use a custom *http.Client | Default client |
| WithHeader(key, value) | Add a custom HTTP header | None |
| WithTimeout(duration) | Set the request timeout | 120 seconds |

provider := ollama.New(
    ollama.WithBaseURL("http://gpu-server:11434"),
    ollama.WithTimeout(5 * time.Minute),
)
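
The remaining options cover transport customization. A minimal sketch combining them (the client settings and header name here are illustrative, not required by the provider):

// Bring your own *http.Client and attach an extra header to every request.
httpClient := &http.Client{Timeout: 2 * time.Minute}

provider := ollama.New(
    ollama.WithHTTPClient(httpClient),
    ollama.WithHeader("X-Request-Source", "my-app"), // hypothetical header name
)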

| Feature | Supported | Notes |
| --- | --- | --- |
| Chat | Yes | All Ollama models |
| Streaming | Yes | Real-time token streaming |
| Tool calling | Yes | Model-dependent |
| Vision | Yes | LLaVA, BakLLaVA, etc. |
| Reasoning | Yes | DeepSeek-R1, etc. |
| Image generation | No | Not supported |
| Embeddings | Yes | nomic-embed-text, all-minilm |

| Model | Parameters | Context | Best For |
| --- | --- | --- | --- |
| llama3.1 | 8B | 128K | General purpose |
| llama3.1:70b | 70B | 128K | Complex tasks |
| llama3.2 | 1B/3B | 128K | Fast, lightweight |
| mistral | 7B | 32K | Balanced performance |
| mixtral | 8x7B | 32K | High-quality MoE |
| codellama | 7B/13B/34B | 16K | Code generation |
| deepseek-coder-v2 | 16B/236B | 128K | Advanced coding |
| phi3 | 3.8B | 128K | Microsoft’s compact model |
| gemma2 | 2B/9B/27B | 8K | Google’s open model |
| qwen2.5 | 0.5B-72B | 128K | Alibaba’s multilingual model |

Vision models:

| Model | Parameters | Best For |
| --- | --- | --- |
| llava | 7B/13B | Image analysis |
| llava-llama3 | 8B | Vision + Llama 3 |
| bakllava | 7B | Image understanding |
| llava-phi3 | 3.8B | Lightweight vision |

Reasoning models:

| Model | Parameters | Best For |
| --- | --- | --- |
| deepseek-r1 | 1.5B-671B | Step-by-step reasoning |
| qwq | 32B | Mathematical reasoning |

Embedding models:

| Model | Dimensions | Best For |
| --- | --- | --- |
| nomic-embed-text | 768 | General embeddings |
| all-minilm | 384 | Lightweight embeddings |
| mxbai-embed-large | 1024 | High quality |

Send a chat request with a system prompt and generation parameters:

resp, err := client.Chat("llama3.1").
    System("You are a helpful coding assistant.").
    User("Write a function to reverse a string in Go.").
    Temperature(0.7).
    MaxTokens(500).
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}
fmt.Println(resp.Output)

Stream responses for real-time output:

stream, err := client.Chat("llama3.1").
    System("You are a helpful assistant.").
    User("Explain Go's concurrency model.").
    GetStream(ctx)
if err != nil {
    log.Fatal(err)
}

for chunk := range stream.Ch {
    fmt.Print(chunk.Content)
}
fmt.Println()

// Check for streaming errors
if err := <-stream.Err; err != nil {
    log.Fatal(err)
}

Use vision models like LLaVA for image analysis:

// First, pull a vision model
// ollama pull llava
imageData, err := os.ReadFile("photo.png")
if err != nil {
    log.Fatal(err)
}
base64Data := base64.StdEncoding.EncodeToString(imageData)

resp, err := client.Chat("llava").
    UserMultimodal().
    Text("What's in this image?").
    ImageBase64(base64Data, "image/png").
    Done().
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}
fmt.Println(resp.Output)

// Multiple images in one message (image1Data and image2Data are
// base64-encoded strings produced the same way as above)
resp, err = client.Chat("llava-llama3").
    UserMultimodal().
    Text("Compare these two images.").
    ImageBase64(image1Data, "image/png").
    ImageBase64(image2Data, "image/png").
    Done().
    GetResponse(ctx)

Use reasoning models for step-by-step problem solving:

// Pull a reasoning model
// ollama pull deepseek-r1:8b
resp, err := client.Chat("deepseek-r1:8b").
    User("Solve this step by step: If x + 5 = 12, what is x?").
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}

// DeepSeek R1 shows its reasoning in the output
fmt.Println(resp.Output)

Ollama supports tool calling with compatible models:

weatherTool := core.Tool{
    Name:        "get_weather",
    Description: "Get current weather for a location",
    Parameters: map[string]interface{}{
        "type": "object",
        "properties": map[string]interface{}{
            "location": map[string]interface{}{
                "type":        "string",
                "description": "City name",
            },
        },
        "required": []string{"location"},
    },
}

// Use a tool-capable model
resp, err := client.Chat("llama3.1").
    User("What's the weather in Tokyo?").
    Tools(weatherTool).
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}

if len(resp.ToolCalls) > 0 {
    // Handle the tool call
    call := resp.ToolCalls[0]
    result := getWeather(call.Arguments)

    // Continue the conversation with the tool result
    finalResp, err := client.Chat("llama3.1").
        User("What's the weather in Tokyo?").
        Tools(weatherTool).
        Assistant(resp.Output).
        ToolCall(call.ID, call.Name, call.Arguments).
        ToolResult(call.ID, result).
        GetResponse(ctx)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(finalResp.Output)
}
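
The getWeather helper above is not part of Iris. A minimal sketch of what it could look like, assuming call.Arguments arrives as a JSON-encoded string (check the core package for the actual type) and returning a canned result instead of calling a real weather API:

// Hypothetical helper for the example above. Assumes the tool call
// arguments are a JSON string; requires the encoding/json import.
func getWeather(arguments string) string {
    var args struct {
        Location string `json:"location"`
    }
    if err := json.Unmarshal([]byte(arguments), &args); err != nil {
        return `{"error":"invalid arguments"}`
    }
    // A real implementation would call a weather service here.
    return fmt.Sprintf(`{"location":%q,"temperature_c":21,"conditions":"clear"}`, args.Location)
}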

Generate embeddings for RAG and semantic search:

// Pull an embedding model
// ollama pull nomic-embed-text
resp, err := provider.Embeddings(ctx, &core.EmbeddingRequest{
    Model: "nomic-embed-text",
    Input: []core.EmbeddingInput{
        {Text: "Go is a statically typed language."},
        {Text: "Python is dynamically typed."},
    },
})
if err != nil {
    log.Fatal(err)
}

for i, emb := range resp.Embeddings {
    fmt.Printf("Embedding %d: %d dimensions\n", i, len(emb.Values))
}
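
To use these vectors for semantic search, rank texts by cosine similarity against a query embedding. A minimal sketch, assuming emb.Values is a slice of floats (written generically since the element type may be float32 or float64) and requiring the math import:

// Cosine similarity between two embedding vectors; higher means more similar.
func cosineSimilarity[F ~float32 | ~float64](a, b []F) float64 {
    var dot, normA, normB float64
    for i := range a {
        dot += float64(a[i]) * float64(b[i])
        normA += float64(a[i]) * float64(a[i])
        normB += float64(b[i]) * float64(b[i])
    }
    if normA == 0 || normB == 0 {
        return 0
    }
    return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}

// Compare the two embeddings generated above.
score := cosineSimilarity(resp.Embeddings[0].Values, resp.Embeddings[1].Values)
fmt.Printf("similarity: %.3f\n", score)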

// List locally installed models
models, err := provider.ListModels(ctx)
if err != nil {
    log.Fatal(err)
}
for _, model := range models {
    fmt.Printf("%s (%s)\n", model.Name, model.Size)
}

// Pull a model, reporting download progress
err := provider.PullModel(ctx, "llama3.1:70b", func(progress ollama.PullProgress) {
    fmt.Printf("Downloading: %.1f%%\n", progress.Percent)
})
if err != nil {
    log.Fatal(err)
}

// Delete a model you no longer need
err := provider.DeleteModel(ctx, "old-model")
if err != nil {
    log.Fatal(err)
}

Fine-tune model behavior:

resp, err := client.Chat("llama3.1").
    User("Write a creative story.").
    Temperature(0.9).       // Higher for creativity
    TopP(0.95).             // Nucleus sampling
    TopK(40).               // Top-k sampling
    RepetitionPenalty(1.1). // Reduce repetition
    NumPredict(500).        // Max tokens
    GetResponse(ctx)

// Use more context for long conversations
resp, err := client.Chat("llama3.1").
    User(longPrompt).
    NumCtx(8192). // Increase context window
    GetResponse(ctx)

// First turn
resp1, _ := client.Chat("llama3.1").
    System("You are a helpful Go tutor.").
    User("What is a goroutine?").
    GetResponse(ctx)

// Second turn with history
resp2, _ := client.Chat("llama3.1").
    System("You are a helpful Go tutor.").
    User("What is a goroutine?").
    Assistant(resp1.Output).
    User("How is it different from a thread?").
    GetResponse(ctx)
fmt.Println(resp2.Output)

// Connect to Ollama on another machine
provider := ollama.New(
    ollama.WithBaseURL("http://gpu-server.local:11434"),
    ollama.WithTimeout(5 * time.Minute),
)

docker-compose.yml

services:
  ollama:
    image: ollama/ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]

volumes:
  ollama:

// Connect to Docker-hosted Ollama
provider := ollama.New(
    ollama.WithBaseURL("http://localhost:11434"),
)

Ollama automatically uses GPU when available:

# Check GPU status
ollama run llama3.1 --verbose
# Look for: "using CUDA" or "using Metal"

// For large models, increase keep-alive
resp, err := client.Chat("llama3.1:70b").
    User(prompt).
    KeepAlive("30m"). // Keep model in memory
    GetResponse(ctx)

// Handle common errors
resp, err := client.Chat("llama3.1").User(prompt).GetResponse(ctx)
if err != nil {
    // Check if Ollama is running
    if strings.Contains(err.Error(), "connection refused") {
        log.Fatal("Ollama is not running. Start with: ollama serve")
    }

    // Check if the model is available
    if strings.Contains(err.Error(), "model not found") {
        log.Fatal("Model not installed. Run: ollama pull llama3.1")
    }

    var apiErr *core.APIError
    if errors.As(err, &apiErr) {
        log.Printf("API error %d: %s", apiErr.StatusCode, apiErr.Message)
    }

    if errors.Is(err, context.DeadlineExceeded) {
        log.Println("Request timed out - try a smaller model or increase the timeout")
    }
}

| Hardware | Recommended |
| --- | --- |
| 8GB RAM | 3B-7B models |
| 16GB RAM | 7B-13B models |
| 32GB+ RAM | 13B-70B models |
| GPU with 8GB VRAM | 7B-13B models |
| GPU with 24GB+ VRAM | 70B+ models |

# Use quantized models for better performance
ollama pull llama3.1:8b-instruct-q4_0   # 4-bit quantization
ollama pull llama3.1:8b-instruct-q8_0   # 8-bit quantization

// Keep model in GPU memory between requests
resp, err := client.Chat("llama3.1").
    User(prompt).
    KeepAlive("1h").
    GetResponse(ctx)

// Process multiple prompts efficiently
var wg sync.WaitGroup
for _, prompt := range prompts {
    wg.Add(1)
    go func(p string) {
        defer wg.Done()
        resp, err := client.Chat("llama3.1").User(p).GetResponse(ctx)
        if err != nil {
            log.Println(err)
            return
        }
        fmt.Println(resp.Output) // Process the response
    }(prompt)
}
wg.Wait()

// Use fast models for development
provider := ollama.NewLocal()
client := core.NewClient(provider)

// Quick iteration with a small model
resp, err := client.Chat("llama3.2:3b"). // Fast for testing
    User(prompt).
    GetResponse(ctx)

// More robust configuration for production
provider := ollama.New(
    ollama.WithBaseURL(os.Getenv("OLLAMA_HOST")),
    ollama.WithTimeout(5 * time.Minute),
)
client := core.NewClient(provider,
    core.WithRetryPolicy(&core.RetryPolicy{
        MaxRetries:      3,
        InitialInterval: 1 * time.Second,
        MaxInterval:     30 * time.Second,
    }),
)

// Use Ollama locally, fall back to a cloud provider
localProvider := ollama.NewLocal()
cloudProvider, _ := openai.NewFromEnv()

// Try local first
resp, err := core.NewClient(localProvider).
    Chat("llama3.1").
    User(prompt).
    GetResponse(ctx)
if err != nil {
    // Fall back to OpenAI
    resp, err = core.NewClient(cloudProvider).
        Chat("gpt-4o-mini").
        User(prompt).
        GetResponse(ctx)
}
  • NewLocal() checks the OLLAMA_HOST env var before defaulting to localhost:11434
  • No API key is required for local instances
  • Cloud mode adds an Authorization: Bearer header to requests
  • Models must be pulled before use with ollama pull <model>
  • The provider is safe for concurrent use after construction
  • GPU acceleration is automatic when available

  • Tools Guide: Learn tool calling with local models.
  • Images Guide: Work with vision models.
  • Providers Overview: Compare all available providers.