The Ollama provider connects Iris to locally running or cloud-hosted Ollama instances. Run models like Llama 3.1, Mistral, and Code Llama on your own hardware for privacy, cost savings, and offline capabilities.
```go
package main

import (
    "context"
    "fmt"

    "github.com/petal-labs/iris/core"
    "github.com/petal-labs/iris/providers/ollama"
)

func main() {
    provider := ollama.NewLocal()
    client := core.NewClient(provider)

    resp, err := client.Chat("llama3.1").
        System("You are a helpful assistant.").
        User("What is the capital of France?").
        GetResponse(context.Background())
    if err != nil {
        panic(err)
    }
    fmt.Println(resp.Output)
}
```

Before using the Ollama provider, install Ollama on your system:
```bash
# Using Homebrew
brew install ollama

# Or download from ollama.com
curl -fsSL https://ollama.com/install.sh | sh
```

Alternatively, download the installer from ollama.com.
Then start the Ollama service:
```bash
# Start the Ollama server
ollama serve

# Pull a model
ollama pull llama3.1
```

```bash
# No configuration needed for local!
# Default: http://localhost:11434

# Optional: override the host
export OLLAMA_HOST=http://192.168.1.100:11434
```

For Ollama Cloud, store an API key:

```bash
# Store in the encrypted keystore
iris keys set ollama

# Or use an environment variable
export OLLAMA_API_KEY=...
```

Then construct the provider in Go:

```go
import "github.com/petal-labs/iris/providers/ollama"

// Local instance (default: http://localhost:11434)
provider := ollama.NewLocal()
```

```go
// Local instance with a custom host via the OLLAMA_HOST env var
os.Setenv("OLLAMA_HOST", "http://192.168.1.100:11434")
provider := ollama.NewLocal()
```

```go
// Cloud instance from OLLAMA_API_KEY
provider, err := ollama.NewCloudFromEnv()
```

```go
// Manual configuration
provider := ollama.New(
    ollama.WithBaseURL("http://my-server:11434"),
    ollama.WithCloud(),
    ollama.WithAPIKey("..."),
)
```

| Option | Description | Default |
|---|---|---|
| `WithBaseURL(url)` | Override the API base URL | `http://localhost:11434` |
| `WithCloud()` | Enable cloud mode (adds auth headers) | Disabled |
| `WithAPIKey(key)` | Set the API key for cloud mode | None |
| `WithHTTPClient(client)` | Use a custom `*http.Client` | Default client |
| `WithHeader(key, value)` | Add a custom HTTP header | None |
| `WithTimeout(duration)` | Set the request timeout | 120 seconds |
```go
provider := ollama.New(
    ollama.WithBaseURL("http://gpu-server:11434"),
    ollama.WithTimeout(5 * time.Minute),
)
```
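The remaining options compose the same way. Here is a minimal sketch using `WithHTTPClient` and `WithHeader`; the header name is illustrative, not something Ollama requires:

```go
// Sketch: custom HTTP client plus an extra header.
// The "X-Request-Source" header below is purely illustrative.
httpClient := &http.Client{Timeout: 2 * time.Minute}

provider := ollama.New(
    ollama.WithBaseURL("http://localhost:11434"),
    ollama.WithHTTPClient(httpClient),
    ollama.WithHeader("X-Request-Source", "iris-docs"),
)
```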
| Feature | Supported | Notes |
|---|---|---|
| Chat | ✓ | All Ollama models |
| Streaming | ✓ | Real-time token streaming |
| Tool calling | ✓ | Model-dependent |
| Vision | ✓ | LLaVA, BakLLaVA, etc. |
| Reasoning | ✓ | DeepSeek-R1, etc. |
| Image generation | ✗ | |
| Embeddings | ✓ | nomic-embed-text, all-minilm |
| Model | Parameters | Context | Best For |
|---|---|---|---|
| `llama3.1` | 8B | 128K | General purpose |
| `llama3.1:70b` | 70B | 128K | Complex tasks |
| `llama3.2` | 1B/3B | 128K | Fast, lightweight |
| `mistral` | 7B | 32K | Balanced performance |
| `mixtral` | 8x7B | 32K | High-quality MoE |
| `codellama` | 7B/13B/34B | 16K | Code generation |
| `deepseek-coder-v2` | 16B/236B | 128K | Advanced coding |
| `phi3` | 3.8B | 128K | Microsoft's compact model |
| `gemma2` | 2B/9B/27B | 8K | Google's open model |
| `qwen2.5` | 0.5B-72B | 128K | Alibaba's multilingual models |
| Model | Parameters | Best For |
|---|---|---|
| `llava` | 7B/13B | Image analysis |
| `llava-llama3` | 8B | Vision + Llama 3 |
| `bakllava` | 7B | Image understanding |
| `llava-phi3` | 3.8B | Lightweight vision |
| Model | Parameters | Best For |
|---|---|---|
| `deepseek-r1` | 1.5B-671B | Step-by-step reasoning |
| `qwq` | 32B | Mathematical reasoning |
| Model | Dimensions | Best For |
|---|---|---|
| `nomic-embed-text` | 768 | General embeddings |
| `all-minilm` | 384 | Lightweight embeddings |
| `mxbai-embed-large` | 1024 | High quality |
resp, err := client.Chat("llama3.1"). System("You are a helpful coding assistant."). User("Write a function to reverse a string in Go."). Temperature(0.7). MaxTokens(500). GetResponse(ctx)
if err != nil { log.Fatal(err)}fmt.Println(resp.Output)Stream responses for real-time output:
```go
stream, err := client.Chat("llama3.1").
    System("You are a helpful assistant.").
    User("Explain Go's concurrency model.").
    GetStream(ctx)
if err != nil {
    log.Fatal(err)
}

for chunk := range stream.Ch {
    fmt.Print(chunk.Content)
}
fmt.Println()

// Check for streaming errors
if err := <-stream.Err; err != nil {
    log.Fatal(err)
}
```
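If you also need the full text after streaming (for logging or caching), you can accumulate the chunks as they arrive. A minimal sketch reusing the `stream.Ch` and `stream.Err` channels from the example above:

```go
// Accumulate streamed chunks into the complete response text
// while still printing them in real time.
var sb strings.Builder
for chunk := range stream.Ch {
    fmt.Print(chunk.Content)
    fmt.Fprint(&sb, chunk.Content)
}
if err := <-stream.Err; err != nil {
    log.Fatal(err)
}
full := sb.String()
fmt.Printf("\nReceived %d characters\n", len(full))
```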
Use vision models like LLaVA for image analysis:

```go
// First, pull a vision model:
// ollama pull llava

imageData, err := os.ReadFile("photo.png")
if err != nil {
    log.Fatal(err)
}
base64Data := base64.StdEncoding.EncodeToString(imageData)

resp, err := client.Chat("llava").
    UserMultimodal().
    Text("What's in this image?").
    ImageBase64(base64Data, "image/png").
    Done().
    GetResponse(ctx)

fmt.Println(resp.Output)
```

```go
// Send multiple images in a single message
resp, err := client.Chat("llava-llama3").
    UserMultimodal().
    Text("Compare these two images.").
    ImageBase64(image1Data, "image/png").
    ImageBase64(image2Data, "image/png").
    Done().
    GetResponse(ctx)
```

Use reasoning models for step-by-step problem solving:
```go
// Pull a reasoning model:
// ollama pull deepseek-r1:8b

resp, err := client.Chat("deepseek-r1:8b").
    User("Solve this step by step: If x + 5 = 12, what is x?").
    GetResponse(ctx)

// DeepSeek R1 shows its reasoning in the output
fmt.Println(resp.Output)
```
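Reasoning models typically interleave their chain of thought with the final answer. As a rough illustration, assuming the model wraps its reasoning in `<think>...</think>` tags (a common convention for DeepSeek-R1 builds; adjust for your model's actual output), you could separate the two like this:

```go
// Sketch: split <think>...</think> reasoning from the final answer.
// Requires "regexp" and "strings".
var thinkRe = regexp.MustCompile(`(?s)<think>(.*?)</think>`)

func splitReasoning(output string) (reasoning, answer string) {
    if m := thinkRe.FindStringSubmatch(output); m != nil {
        reasoning = strings.TrimSpace(m[1])
    }
    answer = strings.TrimSpace(thinkRe.ReplaceAllString(output, ""))
    return reasoning, answer
}
```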
Ollama supports tool calling with compatible models:

```go
weatherTool := core.Tool{
    Name:        "get_weather",
    Description: "Get current weather for a location",
    Parameters: map[string]interface{}{
        "type": "object",
        "properties": map[string]interface{}{
            "location": map[string]interface{}{
                "type":        "string",
                "description": "City name",
            },
        },
        "required": []string{"location"},
    },
}

// Use a tool-capable model
resp, err := client.Chat("llama3.1").
    User("What's the weather in Tokyo?").
    Tools(weatherTool).
    GetResponse(ctx)

if len(resp.ToolCalls) > 0 {
    // Handle the tool call
    call := resp.ToolCalls[0]
    result := getWeather(call.Arguments)

    // Continue the conversation with the tool result
    finalResp, err := client.Chat("llama3.1").
        User("What's the weather in Tokyo?").
        Tools(weatherTool).
        Assistant(resp.Output).
        ToolCall(call.ID, call.Name, call.Arguments).
        ToolResult(call.ID, result).
        GetResponse(ctx)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println(finalResp.Output)
}
```
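The `getWeather` helper above is application code, not part of Iris. One possible shape, assuming `call.Arguments` is a JSON-encoded string (adapt if your version exposes it as a map or raw message):

```go
// Hypothetical tool implementation: parse the model's JSON arguments
// and return a result string to feed back via ToolResult.
// Requires "encoding/json".
func getWeather(arguments string) string {
    var args struct {
        Location string `json:"location"`
    }
    if err := json.Unmarshal([]byte(arguments), &args); err != nil {
        return `{"error": "invalid arguments"}`
    }
    // A real implementation would call a weather API here.
    return fmt.Sprintf(`{"location": %q, "temp_c": 22, "conditions": "clear"}`, args.Location)
}
```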
Generate embeddings for RAG and semantic search:

```go
// Pull an embedding model:
// ollama pull nomic-embed-text

resp, err := provider.Embeddings(ctx, &core.EmbeddingRequest{
    Model: "nomic-embed-text",
    Input: []core.EmbeddingInput{
        {Text: "Go is a statically typed language."},
        {Text: "Python is dynamically typed."},
    },
})
if err != nil {
    log.Fatal(err)
}

for i, emb := range resp.Embeddings {
    fmt.Printf("Embedding %d: %d dimensions\n", i, len(emb.Values))
}
```
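For semantic search, you typically compare embedding vectors by cosine similarity. A minimal sketch, assuming you convert `emb.Values` to `[]float64` (the element type may differ in your version):

```go
// cosineSimilarity returns a value in [-1, 1]; higher means more similar.
// Both vectors must have the same length. Requires "math".
func cosineSimilarity(a, b []float64) float64 {
    var dot, normA, normB float64
    for i := range a {
        dot += a[i] * b[i]
        normA += a[i] * a[i]
        normB += b[i] * b[i]
    }
    if normA == 0 || normB == 0 {
        return 0
    }
    return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}
```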
Manage models programmatically:

```go
// List installed models
models, err := provider.ListModels(ctx)
if err != nil {
    log.Fatal(err)
}
for _, model := range models {
    fmt.Printf("%s (%s)\n", model.Name, model.Size)
}

// Pull a model with download progress
err = provider.PullModel(ctx, "llama3.1:70b", func(progress ollama.PullProgress) {
    fmt.Printf("Downloading: %.1f%%\n", progress.Percent)
})
if err != nil {
    log.Fatal(err)
}

// Delete a model that is no longer needed
err = provider.DeleteModel(ctx, "old-model")
if err != nil {
    log.Fatal(err)
}
```

Fine-tune model behavior:
resp, err := client.Chat("llama3.1"). User("Write a creative story."). Temperature(0.9). // Higher for creativity TopP(0.95). // Nucleus sampling TopK(40). // Top-k sampling RepetitionPenalty(1.1). // Reduce repetition NumPredict(500). // Max tokens GetResponse(ctx)// Use more context for long conversationsresp, err := client.Chat("llama3.1"). User(longPrompt). NumCtx(8192). // Increase context window GetResponse(ctx)// First turnresp1, _ := client.Chat("llama3.1"). System("You are a helpful Go tutor."). User("What is a goroutine?"). GetResponse(ctx)
// Second turn with historyresp2, _ := client.Chat("llama3.1"). System("You are a helpful Go tutor."). User("What is a goroutine?"). Assistant(resp1.Output). User("How is it different from a thread?"). GetResponse(ctx)// Connect to Ollama on another machineprovider := ollama.New( ollama.WithBaseURL("http://gpu-server.local:11434"), ollama.WithTimeout(5 * time.Minute),)services: ollama: image: ollama/ollama ports: - "11434:11434" volumes: - ollama:/root/.ollama deploy: resources: reservations: devices: - capabilities: [gpu]
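For longer sessions you can replay accumulated history in a loop rather than hard-coding each turn. A sketch under the assumption that the builder returned by `Chat(...)` can be held in a variable and that `User`/`Assistant` return the same builder type:

```go
// Sketch: replay prior turns before each new user message.
history := []struct{ role, content string }{
    {"user", "What is a goroutine?"},
    {"assistant", resp1.Output},
}

b := client.Chat("llama3.1").System("You are a helpful Go tutor.")
for _, t := range history {
    if t.role == "user" {
        b = b.User(t.content)
    } else {
        b = b.Assistant(t.content)
    }
}
resp, err := b.User("How is it different from a thread?").GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}
fmt.Println(resp.Output)
```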
```go
// Connect to Ollama on another machine
provider := ollama.New(
    ollama.WithBaseURL("http://gpu-server.local:11434"),
    ollama.WithTimeout(5 * time.Minute),
)
```

To run Ollama in Docker with GPU access:

```yaml
services:
  ollama:
    image: ollama/ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]

volumes:
  ollama:
```

```go
// Connect to Docker-hosted Ollama
provider := ollama.New(
    ollama.WithBaseURL("http://localhost:11434"),
)
```

Ollama automatically uses the GPU when available:
```bash
# Check GPU status
ollama run llama3.1 --verbose
# Look for: "using CUDA" or "using Metal"
```

```go
// For large models, increase the keep-alive
resp, err := client.Chat("llama3.1:70b").
    User(prompt).
    KeepAlive("30m"). // Keep the model in memory
    GetResponse(ctx)
```

Handle common failures explicitly:

```go
resp, err := client.Chat("llama3.1").User(prompt).GetResponse(ctx)
if err != nil {
    // Check if Ollama is running
    if strings.Contains(err.Error(), "connection refused") {
        log.Fatal("Ollama is not running. Start with: ollama serve")
    }

    // Check if the model is available
    if strings.Contains(err.Error(), "model not found") {
        log.Fatal("Model not installed. Run: ollama pull llama3.1")
    }

    var apiErr *core.APIError
    if errors.As(err, &apiErr) {
        log.Printf("API error %d: %s", apiErr.StatusCode, apiErr.Message)
    }

    if errors.Is(err, context.DeadlineExceeded) {
        log.Println("Request timed out - try a smaller model or increase the timeout")
    }
}
```

Rough hardware guidance for local models:

| Hardware | Recommended |
|---|---|
| 8GB RAM | 3B-7B models |
| 16GB RAM | 7B-13B models |
| 32GB+ RAM | 13B-70B models |
| GPU with 8GB VRAM | 7B-13B models |
| GPU with 24GB+ VRAM | 70B+ models |
```bash
# Use quantized models for better performance
ollama pull llama3.1:8b-instruct-q4_0  # 4-bit quantization
ollama pull llama3.1:8b-instruct-q8_0  # 8-bit quantization
```

```go
// Keep the model in GPU memory between requests
resp, err := client.Chat("llama3.1").
    User(prompt).
    KeepAlive("1h").
    GetResponse(ctx)
```

```go
// Process multiple prompts concurrently
var wg sync.WaitGroup
for _, prompt := range prompts {
    wg.Add(1)
    go func(p string) {
        defer wg.Done()
        resp, _ := client.Chat("llama3.1").User(p).GetResponse(ctx)
        // Process the response
        _ = resp
    }(prompt)
}
wg.Wait()
```
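The goroutine example above discards responses and errors. A sketch that collects both safely over a buffered channel (the `result` type is illustrative):

```go
// Collect results from concurrent requests without data races.
type result struct {
    prompt string
    output string
    err    error
}

results := make(chan result, len(prompts))
var wg sync.WaitGroup
for _, prompt := range prompts {
    wg.Add(1)
    go func(p string) {
        defer wg.Done()
        resp, err := client.Chat("llama3.1").User(p).GetResponse(ctx)
        if err != nil {
            results <- result{prompt: p, err: err}
            return
        }
        results <- result{prompt: p, output: resp.Output}
    }(prompt)
}
wg.Wait()
close(results)

for r := range results {
    if r.err != nil {
        log.Printf("prompt %q failed: %v", r.prompt, r.err)
        continue
    }
    fmt.Println(r.output)
}
```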
```go
// Use fast models for development
provider := ollama.NewLocal()
client := core.NewClient(provider)

// Quick iteration with a small model
resp, err := client.Chat("llama3.2:3b"). // Fast for testing
    User(prompt).
    GetResponse(ctx)
```

```go
// More robust configuration for production
provider := ollama.New(
    ollama.WithBaseURL(os.Getenv("OLLAMA_HOST")),
    ollama.WithTimeout(5 * time.Minute),
)

client := core.NewClient(provider,
    core.WithRetryPolicy(&core.RetryPolicy{
        MaxRetries:      3,
        InitialInterval: 1 * time.Second,
        MaxInterval:     30 * time.Second,
    }),
)
```
```go
// Use Ollama locally, fall back to a cloud provider
localProvider := ollama.NewLocal()
cloudProvider, _ := openai.NewFromEnv()

// Try local first
resp, err := core.NewClient(localProvider).
    Chat("llama3.1").
    User(prompt).
    GetResponse(ctx)

if err != nil {
    // Fall back to OpenAI
    resp, err = core.NewClient(cloudProvider).
        Chat("gpt-4o-mini").
        User(prompt).
        GetResponse(ctx)
}
```

Notes:

- `NewLocal()` checks the `OLLAMA_HOST` env var before defaulting to `localhost:11434`.
- Cloud mode adds an `Authorization: Bearer` header to requests.
- Pull models before first use with `ollama pull <model>`.

Tools Guide
Learn tool calling with local models. Tools →
Streaming Guide
Master streaming responses. Streaming →
Images Guide
Work with vision models. Images →
Providers Overview
Compare all available providers. Providers →