The Hugging Face provider connects Iris to thousands of models hosted on the Hugging Face Inference API. Access open-source models like Llama, Mistral, Falcon, and many more through a unified interface.
```go
package main

import (
	"context"
	"fmt"
	"os"

	"github.com/petal-labs/iris/core"
	"github.com/petal-labs/iris/providers/huggingface"
)

func main() {
	provider := huggingface.New(os.Getenv("HF_TOKEN"))
	client := core.NewClient(provider)

	resp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
		User("Explain the attention mechanism in transformers.").
		GetResponse(context.Background())
	if err != nil {
		panic(err)
	}
	fmt.Println(resp.Output)
}
```

Store your token in the encrypted keystore (recommended):

```bash
iris keys set huggingface
# Prompts for: Enter API key for huggingface: hf_...
```

Or set it in an environment variable:

```bash
# Primary variable
export HF_TOKEN=hf_...

# Fallback (also accepted)
export HUGGINGFACE_TOKEN=hf_...
```

Create the provider from a token string, from the environment, or from the Iris keystore:

```go
import "github.com/petal-labs/iris/providers/huggingface"

// From a token string
provider := huggingface.New("hf_...")

// From the HF_TOKEN or HUGGINGFACE_TOKEN environment variable
provider, err := huggingface.NewFromEnv()
if err != nil {
	log.Fatal("HF_TOKEN not set:", err)
}

// From the Iris keystore
provider, err := huggingface.NewFromKeystore()
```

| Option | Description | Default |
|---|---|---|
| `WithBaseURL(url)` | Override the Inference API base URL | Auto-resolved per model |
| `WithHubAPIBaseURL(url)` | Override the Hub API base URL | `https://huggingface.co` |
| `WithHTTPClient(client)` | Use a custom `*http.Client` | Default client |
| `WithHeader(key, value)` | Add a custom HTTP header | None |
| `WithTimeout(duration)` | Set the request timeout | 120 seconds |
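Options can be combined. A minimal sketch using `WithHTTPClient` and `WithHeader` from the table above; the client settings and header values here are illustrative placeholders:

```go
// Custom transport plus an extra header; both options are listed above.
httpClient := &http.Client{Timeout: 90 * time.Second}

provider := huggingface.New("hf_...",
	huggingface.WithHTTPClient(httpClient),
	huggingface.WithHeader("X-Request-Source", "my-app"), // placeholder header
)
```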
For example, to extend the request timeout:

```go
provider := huggingface.New("hf_...",
	huggingface.WithTimeout(180*time.Second),
)
```

The provider supports the following features:

| Feature | Supported | Notes |
|---|---|---|
| Chat | ✓ | Instruction-tuned models |
| Streaming | ✓ | Real-time token streaming |
| Tool calling | ✓ | Model-dependent |
| Vision | ✓ | Multimodal models |
| Image generation | Not supported | |
| Embeddings | Not supported | |
Hugging Face hosts thousands of models. Here are some popular options:
Meta Llama:

| Model ID | Parameters | Best For |
|---|---|---|
| `meta-llama/Llama-3.1-70B-Instruct` | 70B | Complex reasoning |
| `meta-llama/Llama-3.1-8B-Instruct` | 8B | General purpose |
| `meta-llama/Llama-3.2-3B-Instruct` | 3B | Fast, lightweight |
| `meta-llama/Llama-3.2-1B-Instruct` | 1B | Ultra-fast |
Mistral:

| Model ID | Parameters | Best For |
|---|---|---|
| `mistralai/Mistral-7B-Instruct-v0.3` | 7B | Balanced performance |
| `mistralai/Mixtral-8x7B-Instruct-v0.1` | 8x7B | High-quality MoE |
| `mistralai/Mistral-Nemo-Instruct-2407` | 12B | Latest Mistral |
Google Gemma:

| Model ID | Parameters | Best For |
|---|---|---|
| `google/gemma-2-27b-it` | 27B | Complex tasks |
| `google/gemma-2-9b-it` | 9B | General purpose |
| `google/gemma-2-2b-it` | 2B | Fast inference |
Microsoft Phi:

| Model ID | Parameters | Best For |
|---|---|---|
| `microsoft/Phi-3.5-mini-instruct` | 3.8B | Compact, capable |
| `microsoft/Phi-3-medium-4k-instruct` | 14B | Medium tasks |
Other notable models:

| Model ID | Parameters | Best For |
|---|---|---|
| `Qwen/Qwen2.5-72B-Instruct` | 72B | Multilingual |
| `deepseek-ai/DeepSeek-Coder-V2-Instruct` | Various | Code generation |
| `nvidia/Llama-3.1-Nemotron-70B-Instruct-HF` | 70B | NVIDIA-optimized |
Send a basic chat request:

```go
resp, err := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
	System("You are a helpful coding assistant.").
	User("Write a function to calculate Fibonacci numbers in Go.").
	Temperature(0.7).
	MaxTokens(500).
	GetResponse(ctx)
if err != nil {
	log.Fatal(err)
}
fmt.Println(resp.Output)
```

Stream tokens as they are generated:

```go
stream, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
	System("You are a helpful assistant.").
	User("Explain how transformers work in machine learning.").
	GetStream(ctx)
if err != nil {
	log.Fatal(err)
}

for chunk := range stream.Ch {
	fmt.Print(chunk.Content)
}
fmt.Println()

if err := <-stream.Err; err != nil {
	log.Fatal(err)
}
```
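To keep the full response as well as printing it live, accumulate the chunks; a minimal sketch assuming the `stream.Ch`/`stream.Err` shape shown above:

```go
// Collect streamed chunks into one string while echoing them.
var sb strings.Builder
for chunk := range stream.Ch {
	fmt.Print(chunk.Content)
	sb.WriteString(chunk.Content)
}
if err := <-stream.Err; err != nil {
	log.Fatal(err)
}
fullText := sb.String() // complete response once the stream closes
fmt.Println("\nreceived", len(fullText), "bytes")
```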
Use vision-capable models for image analysis:

```go
imageData, err := os.ReadFile("photo.png")
if err != nil {
	log.Fatal(err)
}
base64Data := base64.StdEncoding.EncodeToString(imageData)

resp, err := client.Chat("meta-llama/Llama-3.2-11B-Vision-Instruct").
	UserMultimodal().
	Text("What's in this image?").
	ImageBase64(base64Data, "image/png").
	Done().
	GetResponse(ctx)
if err != nil {
	log.Fatal(err)
}
fmt.Println(resp.Output)
```
Tool calling support depends on the model:

```go
weatherTool := core.Tool{
	Name:        "get_weather",
	Description: "Get current weather for a location",
	Parameters: map[string]interface{}{
		"type": "object",
		"properties": map[string]interface{}{
			"location": map[string]interface{}{
				"type":        "string",
				"description": "City name",
			},
		},
		"required": []string{"location"},
	},
}

// Use a model that supports tool calling
resp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
	User("What's the weather in Tokyo?").
	Tools(weatherTool).
	GetResponse(ctx)
if err != nil {
	log.Fatal(err)
}

if len(resp.ToolCalls) > 0 {
	call := resp.ToolCalls[0]
	result := getWeather(call.Arguments)

	finalResp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
		User("What's the weather in Tokyo?").
		Tools(weatherTool).
		Assistant(resp.Output).
		ToolCall(call.ID, call.Name, call.Arguments).
		ToolResult(call.ID, result).
		GetResponse(ctx)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(finalResp.Output)
}
```
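The `getWeather` helper in the example above is application code, not part of Iris. A hypothetical stub, assuming `call.Arguments` carries the JSON-encoded arguments as a string:

```go
// Hypothetical tool implementation: decode the model's JSON arguments,
// then return a JSON result string to send back as the tool result.
func getWeather(arguments string) string {
	var args struct {
		Location string `json:"location"`
	}
	if err := json.Unmarshal([]byte(arguments), &args); err != nil {
		return `{"error":"invalid arguments"}`
	}
	// A real implementation would call a weather service here.
	return fmt.Sprintf(`{"location":%q,"temp_c":21,"conditions":"clear"}`, args.Location)
}
```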
Pick a model to match the task:

```go
// Select a model based on task complexity
func selectModel(task string, complexity string) string {
	switch complexity {
	case "simple":
		return "meta-llama/Llama-3.2-3B-Instruct"
	case "medium":
		return "meta-llama/Llama-3.1-8B-Instruct"
	case "complex":
		return "meta-llama/Llama-3.1-70B-Instruct"
	default:
		return "meta-llama/Llama-3.1-8B-Instruct"
	}
}

model := selectModel("coding", "complex")
resp, err := client.Chat(model).User(prompt).GetResponse(ctx)
```

You can also discover models at runtime:

```go
// List available models (requires Hub API access)
models, err := provider.ListModels(ctx, huggingface.ListModelsOptions{
	Pipeline:  "text-generation",
	Inference: "warm", // Only models ready for inference
})
if err != nil {
	log.Fatal(err)
}

for _, m := range models {
	fmt.Printf("%s - %v\n", m.ID, m.Downloads)
}
```

For production workloads, use dedicated Inference Endpoints:
```go
// Connect to a dedicated Inference Endpoint
provider := huggingface.New("hf_...",
	huggingface.WithBaseURL("https://your-endpoint.endpoints.huggingface.cloud"),
)

// Use as normal
resp, err := client.Chat(""). // Model is determined by the endpoint
	User(prompt).
	GetResponse(ctx)
```

Carry on a multi-turn conversation by replaying the history each turn:

```go
// First turn
resp1, _ := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
	System("You are a helpful programming tutor.").
	User("What is recursion?").
	GetResponse(ctx)

// Second turn with history
resp2, _ := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
	System("You are a helpful programming tutor.").
	User("What is recursion?").
	Assistant(resp1.Output).
	User("Give me a Go example.").
	GetResponse(ctx)
```

Handle provider errors by inspecting the returned error:

```go
resp, err := client.Chat(model).User(prompt).GetResponse(ctx)
if err != nil {
	// Check for model loading
	if strings.Contains(err.Error(), "loading") {
		log.Println("Model is loading, please retry in a moment")
		// Implement retry logic
	}

	// Check for rate limits
	var apiErr *core.APIError
	if errors.As(err, &apiErr) {
		switch apiErr.StatusCode {
		case 401:
			log.Fatal("Invalid HF token")
		case 403:
			log.Fatal("Token doesn't have Inference API permission")
		case 429:
			log.Printf("Rate limited. Retry after: %s", apiErr.RetryAfter)
		case 503:
			log.Println("Model is loading, retry later")
		}
	}

	if errors.Is(err, context.DeadlineExceeded) {
		log.Println("Request timed out - model may be loading")
	}
}
```
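Where the comment above says "implement retry logic", a simple backoff loop works. A minimal sketch reusing `core.APIError` from the example; the retry counts and delays are arbitrary, and the built-in retry policy shown further below is usually preferable:

```go
// Naive linear-backoff retry on 503 (model still loading).
for attempt := 1; attempt <= 5; attempt++ {
	resp, err := client.Chat(model).User(prompt).GetResponse(ctx)
	if err == nil {
		fmt.Println(resp.Output)
		break
	}
	var apiErr *core.APIError
	if errors.As(err, &apiErr) && apiErr.StatusCode == 503 {
		time.Sleep(time.Duration(attempt) * 10 * time.Second)
		continue
	}
	log.Fatal(err) // non-retryable error
}
```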
Some models need to be "warmed up" before use:

```go
// Check if the model is ready
ready, err := provider.IsModelReady(ctx, "meta-llama/Llama-3.1-70B-Instruct")
if err != nil {
	log.Fatal(err)
}

if !ready {
	// Trigger model loading
	_, err := provider.WarmModel(ctx, "meta-llama/Llama-3.1-70B-Instruct")
	if err != nil {
		log.Printf("Model loading: %v", err)
	}

	// Wait and retry
	time.Sleep(30 * time.Second)
}

// Now use the model
resp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
	User(prompt).
	GetResponse(ctx)
```
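Instead of the fixed 30-second sleep above, you can poll until the model reports ready. A minimal sketch that slots into the `!ready` branch, reusing `IsModelReady`; the interval and deadline are arbitrary:

```go
// Poll readiness with a deadline instead of a single fixed sleep.
deadline := time.Now().Add(5 * time.Minute)
for !ready && time.Now().Before(deadline) {
	time.Sleep(15 * time.Second)
	ready, err = provider.IsModelReady(ctx, "meta-llama/Llama-3.1-70B-Instruct")
	if err != nil {
		log.Printf("readiness check failed: %v", err)
	}
}
```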
Recommended models by task:

| Task | Recommended Model |
|---|---|
| General chat | `meta-llama/Llama-3.1-8B-Instruct` |
| Complex reasoning | `meta-llama/Llama-3.1-70B-Instruct` |
| Code generation | `deepseek-ai/DeepSeek-Coder-V2-Instruct` |
| Fast responses | `meta-llama/Llama-3.2-3B-Instruct` |
| Multilingual | `Qwen/Qwen2.5-72B-Instruct` |
Configure retries to absorb model loading and rate limits:

```go
client := core.NewClient(provider,
	core.WithRetryPolicy(&core.RetryPolicy{
		MaxRetries:        5,
		InitialInterval:   10 * time.Second, // Longer for model loading
		MaxInterval:       60 * time.Second,
		BackoffMultiplier: 2.0,
		RetryOn:           []int{503, 429},
	}),
)
```

```go
// Larger models need longer timeouts
provider := huggingface.New("hf_...",
	huggingface.WithTimeout(180*time.Second), // 3 minutes
)
```

Notes:

- `NewFromEnv()` checks `HF_TOKEN` first, then falls back to `HUGGINGFACE_TOKEN`.
- The provider uses an `Authorization: Bearer` header for authentication.

Tools Guide
Learn tool calling patterns. Tools →
Streaming Guide
Master streaming responses. Streaming →
Providers Overview
Compare all available providers. Providers →