
Hugging Face

The Hugging Face provider connects Iris to thousands of models hosted on the Hugging Face Inference API. Access open-source models like Llama, Mistral, Falcon, and many more through a unified interface.

```go
package main

import (
    "context"
    "fmt"
    "os"

    "github.com/petal-labs/iris/core"
    "github.com/petal-labs/iris/providers/huggingface"
)

func main() {
    provider := huggingface.New(os.Getenv("HF_TOKEN"))
    client := core.NewClient(provider)

    resp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
        User("Explain the attention mechanism in transformers.").
        GetResponse(context.Background())
    if err != nil {
        panic(err)
    }

    fmt.Println(resp.Output)
}
```
Store your Hugging Face token with the Iris CLI:

```sh
# Store in the encrypted keystore (recommended)
iris keys set huggingface
# Prompts for: Enter API key for huggingface: hf_...
```
import "github.com/petal-labs/iris/providers/huggingface"
// From a token string
provider := huggingface.New("hf_...")
// From HF_TOKEN or HUGGINGFACE_TOKEN environment variable
provider, err := huggingface.NewFromEnv()
if err != nil {
log.Fatal("HF_TOKEN not set:", err)
}
// From the Iris keystore
provider, err := huggingface.NewFromKeystore()
| Option | Description | Default |
| --- | --- | --- |
| `WithBaseURL(url)` | Override the Inference API base URL | Auto-resolved per model |
| `WithHubAPIBaseURL(url)` | Override the Hub API base URL | `https://huggingface.co` |
| `WithHTTPClient(client)` | Use a custom `*http.Client` | Default client |
| `WithHeader(key, value)` | Add a custom HTTP header | None |
| `WithTimeout(duration)` | Set the request timeout | 120 seconds |
```go
provider := huggingface.New("hf_...",
    huggingface.WithTimeout(180 * time.Second),
)
```
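The options compose. A quick sketch; only the option functions come from the table above, while the endpoint URL and header are illustrative:

```go
// Illustrative values; the option functions are from the table above.
provider := huggingface.New("hf_...",
    huggingface.WithBaseURL("https://your-endpoint.endpoints.huggingface.cloud"),
    huggingface.WithHeader("X-Request-Source", "my-app"), // hypothetical header
    huggingface.WithTimeout(180 * time.Second),
)
```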
| Feature | Supported | Notes |
| --- | --- | --- |
| Chat | ✅ | Instruction-tuned models |
| Streaming | ✅ | Real-time token streaming |
| Tool calling | ✅ | Model-dependent |
| Vision | ✅ | Multimodal models |
| Image generation | ❌ | Not supported |
| Embeddings | ❌ | Not supported |

Hugging Face hosts thousands of models. Here are some popular options:

**Llama**

| Model ID | Parameters | Best For |
| --- | --- | --- |
| `meta-llama/Llama-3.1-70B-Instruct` | 70B | Complex reasoning |
| `meta-llama/Llama-3.1-8B-Instruct` | 8B | General purpose |
| `meta-llama/Llama-3.2-3B-Instruct` | 3B | Fast, lightweight |
| `meta-llama/Llama-3.2-1B-Instruct` | 1B | Ultra-fast |

**Mistral**

| Model ID | Parameters | Best For |
| --- | --- | --- |
| `mistralai/Mistral-7B-Instruct-v0.3` | 7B | Balanced performance |
| `mistralai/Mixtral-8x7B-Instruct-v0.1` | 8x7B | High-quality MoE |
| `mistralai/Mistral-Nemo-Instruct-2407` | 12B | Latest Mistral |

**Gemma**

| Model ID | Parameters | Best For |
| --- | --- | --- |
| `google/gemma-2-27b-it` | 27B | Complex tasks |
| `google/gemma-2-9b-it` | 9B | General purpose |
| `google/gemma-2-2b-it` | 2B | Fast inference |

**Phi**

| Model ID | Parameters | Best For |
| --- | --- | --- |
| `microsoft/Phi-3.5-mini-instruct` | 3.8B | Compact, capable |
| `microsoft/Phi-3-medium-4k-instruct` | 14B | Medium tasks |

**Other**

| Model ID | Parameters | Best For |
| --- | --- | --- |
| `Qwen/Qwen2.5-72B-Instruct` | 72B | Multilingual |
| `deepseek-ai/DeepSeek-Coder-V2-Instruct` | Various | Code generation |
| `nvidia/Llama-3.1-Nemotron-70B-Instruct-HF` | 70B | NVIDIA-optimized |
Set a system prompt and sampling parameters per request:

```go
resp, err := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
    System("You are a helpful coding assistant.").
    User("Write a function to calculate Fibonacci numbers in Go.").
    Temperature(0.7).
    MaxTokens(500).
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}
fmt.Println(resp.Output)
```
Stream tokens as they arrive:

```go
stream, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
    System("You are a helpful assistant.").
    User("Explain how transformers work in machine learning.").
    GetStream(ctx)
if err != nil {
    log.Fatal(err)
}
for chunk := range stream.Ch {
    fmt.Print(chunk.Content)
}
fmt.Println()
if err := <-stream.Err; err != nil {
    log.Fatal(err)
}
```
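Long generations can be bounded with a context deadline. A minimal sketch, assuming an in-flight stream stops when its context is cancelled (cancellation behavior is an assumption, not confirmed here):

```go
// Bound a stream with a deadline; cancellation behavior is an assumption.
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()

stream, err := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
    User("Summarize the transformer architecture.").
    GetStream(ctx)
if err != nil {
    log.Fatal(err)
}
for chunk := range stream.Ch {
    fmt.Print(chunk.Content)
}
if err := <-stream.Err; err != nil && !errors.Is(err, context.DeadlineExceeded) {
    log.Fatal(err)
}
```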

Use vision-capable models for image analysis:

```go
imageData, err := os.ReadFile("photo.png")
if err != nil {
    log.Fatal(err)
}
base64Data := base64.StdEncoding.EncodeToString(imageData)

resp, err := client.Chat("meta-llama/Llama-3.2-11B-Vision-Instruct").
    UserMultimodal().
    Text("What's in this image?").
    ImageBase64(base64Data, "image/png").
    Done().
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}
fmt.Println(resp.Output)
```

Tool calling support depends on the model:

```go
weatherTool := core.Tool{
    Name:        "get_weather",
    Description: "Get current weather for a location",
    Parameters: map[string]interface{}{
        "type": "object",
        "properties": map[string]interface{}{
            "location": map[string]interface{}{
                "type":        "string",
                "description": "City name",
            },
        },
        "required": []string{"location"},
    },
}

// Use a model that supports tool calling
resp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
    User("What's the weather in Tokyo?").
    Tools(weatherTool).
    GetResponse(ctx)
if err != nil {
    log.Fatal(err)
}

if len(resp.ToolCalls) > 0 {
    call := resp.ToolCalls[0]
    result := getWeather(call.Arguments)

    finalResp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
        User("What's the weather in Tokyo?").
        Tools(weatherTool).
        Assistant(resp.Output).
        ToolCall(call.ID, call.Name, call.Arguments).
        ToolResult(call.ID, result).
        GetResponse(ctx)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(finalResp.Output)
}
```
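The `getWeather` helper above is left undefined. A hypothetical stub, assuming `call.Arguments` arrives as a JSON string, might look like:

```go
// Hypothetical stub for getWeather; a real implementation would call a
// weather API. Assumes the tool arguments arrive as a JSON string.
func getWeather(arguments string) string {
    var args struct {
        Location string `json:"location"`
    }
    if err := json.Unmarshal([]byte(arguments), &args); err != nil {
        return `{"error":"invalid arguments"}`
    }
    // Canned response for illustration.
    return fmt.Sprintf(`{"location":%q,"temperature_c":21,"conditions":"clear"}`, args.Location)
}
```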
```go
// Select model based on task complexity
func selectModel(task string, complexity string) string {
    switch complexity {
    case "simple":
        return "meta-llama/Llama-3.2-3B-Instruct"
    case "medium":
        return "meta-llama/Llama-3.1-8B-Instruct"
    case "complex":
        return "meta-llama/Llama-3.1-70B-Instruct"
    default:
        return "meta-llama/Llama-3.1-8B-Instruct"
    }
}

model := selectModel("coding", "complex")
resp, err := client.Chat(model).User(prompt).GetResponse(ctx)
```
```go
// List available models (requires Hub API access)
models, err := provider.ListModels(ctx, huggingface.ListModelsOptions{
    Pipeline:  "text-generation",
    Inference: "warm", // Only models ready for inference
})
if err != nil {
    log.Fatal(err)
}
for _, m := range models {
    fmt.Printf("%s (%v downloads)\n", m.ID, m.Downloads)
}
```

For production workloads, use dedicated Inference Endpoints:

```go
// Connect to a dedicated Inference Endpoint
provider := huggingface.New("hf_...",
    huggingface.WithBaseURL("https://your-endpoint.endpoints.huggingface.cloud"),
)

// Use as normal
resp, err := client.Chat(""). // Model is determined by the endpoint
    User(prompt).
    GetResponse(ctx)
```
Carry context across turns by replaying the earlier exchange:

```go
// First turn
resp1, _ := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
    System("You are a helpful programming tutor.").
    User("What is recursion?").
    GetResponse(ctx)

// Second turn with history
resp2, _ := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
    System("You are a helpful programming tutor.").
    User("What is recursion?").
    Assistant(resp1.Output).
    User("Give me a Go example.").
    GetResponse(ctx)
```
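For longer sessions, the history replay can be factored into a helper. A sketch under the builder API shown above; the `turn` type and `ask` function are illustrative, and `*core.Client` is assumed to be what `core.NewClient` returns:

```go
// Illustrative helper that replays accumulated history each turn.
type turn struct{ user, assistant string }

func ask(ctx context.Context, client *core.Client, history []turn, question string) (string, error) {
    b := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
        System("You are a helpful programming tutor.")
    for _, t := range history {
        b = b.User(t.user).Assistant(t.assistant)
    }
    resp, err := b.User(question).GetResponse(ctx)
    if err != nil {
        return "", err
    }
    return resp.Output, nil
}
```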
Inspect the error to distinguish model loading, auth problems, and rate limits:

```go
resp, err := client.Chat(model).User(prompt).GetResponse(ctx)
if err != nil {
    // Check for model loading
    if strings.Contains(err.Error(), "loading") {
        log.Println("Model is loading, please retry in a moment")
        // Implement retry logic
    }

    // Check for rate limits and auth failures
    var apiErr *core.APIError
    if errors.As(err, &apiErr) {
        switch apiErr.StatusCode {
        case 401:
            log.Fatal("Invalid HF token")
        case 403:
            log.Fatal("Token doesn't have Inference API permission")
        case 429:
            log.Printf("Rate limited. Retry after: %s", apiErr.RetryAfter)
        case 503:
            log.Println("Model is loading, retry later")
        }
    }

    if errors.Is(err, context.DeadlineExceeded) {
        log.Println("Request timed out - model may be loading")
    }
}
```
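The `// Implement retry logic` placeholder above could be a simple backoff loop. A sketch, assuming a 503 status means the model is still loading (the `*core.Response` type name here is an assumption, not confirmed API):

```go
// Sketch: retry with growing backoff while the model loads (503).
var resp *core.Response // assumed response type name, for illustration
var err error
for attempt := 0; attempt < 5; attempt++ {
    resp, err = client.Chat(model).User(prompt).GetResponse(ctx)
    if err == nil {
        break // success
    }
    var apiErr *core.APIError
    if errors.As(err, &apiErr) && apiErr.StatusCode == 503 {
        time.Sleep(time.Duration(attempt+1) * 10 * time.Second)
        continue
    }
    break // not a loading error; give up
}
```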

Some models need to be “warmed up” before use:

```go
// Check if model is ready
ready, err := provider.IsModelReady(ctx, "meta-llama/Llama-3.1-70B-Instruct")
if err != nil {
    log.Fatal(err)
}
if !ready {
    // Trigger model loading
    _, err := provider.WarmModel(ctx, "meta-llama/Llama-3.1-70B-Instruct")
    if err != nil {
        log.Printf("Model loading: %v", err)
    }
    // Wait and retry
    time.Sleep(30 * time.Second)
}

// Now use the model
resp, err := client.Chat("meta-llama/Llama-3.1-70B-Instruct").
    User(prompt).
    GetResponse(ctx)
```
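A single fixed sleep may be too short for large models. A polling sketch built on the `IsModelReady` call shown above; the deadline and interval are illustrative:

```go
// Poll readiness with a deadline instead of one fixed sleep.
const modelID = "meta-llama/Llama-3.1-70B-Instruct"
deadline := time.Now().Add(5 * time.Minute)
for time.Now().Before(deadline) {
    ready, err := provider.IsModelReady(ctx, modelID)
    if err != nil {
        log.Fatal(err)
    }
    if ready {
        break
    }
    time.Sleep(15 * time.Second)
}
```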
Match the model to the task:

| Task | Recommended Model |
| --- | --- |
| General chat | `meta-llama/Llama-3.1-8B-Instruct` |
| Complex reasoning | `meta-llama/Llama-3.1-70B-Instruct` |
| Code generation | `deepseek-ai/DeepSeek-Coder-V2-Instruct` |
| Fast responses | `meta-llama/Llama-3.2-3B-Instruct` |
| Multilingual | `Qwen/Qwen2.5-72B-Instruct` |
Configure client retries to tolerate model loading (503) and rate limits (429):

```go
client := core.NewClient(provider,
    core.WithRetryPolicy(&core.RetryPolicy{
        MaxRetries:        5,
        InitialInterval:   10 * time.Second, // Longer for model loading
        MaxInterval:       60 * time.Second,
        BackoffMultiplier: 2.0,
        RetryOn:           []int{503, 429},
    }),
)
```
```go
// Larger models need longer timeouts
provider := huggingface.New("hf_...",
    huggingface.WithTimeout(180 * time.Second), // 3 minutes
)
```
  • NewFromEnv() checks HF_TOKEN first, then falls back to HUGGINGFACE_TOKEN
  • Hugging Face hosts thousands of models; you specify the full model ID
  • Uses Authorization: Bearer for authentication
  • Some models require warming up before first use (503 responses)
  • The provider is safe for concurrent use after construction (see the sketch below)
  • Token must have “Make calls to Inference Providers” permission
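
A sketch of the concurrency guarantee in use; the prompts and model are illustrative:

```go
// One shared client across goroutines, relying on the note above.
var wg sync.WaitGroup
prompts := []string{"Summarize Go channels.", "Summarize Go mutexes."}
for _, p := range prompts {
    wg.Add(1)
    go func(prompt string) {
        defer wg.Done()
        resp, err := client.Chat("meta-llama/Llama-3.1-8B-Instruct").
            User(prompt).
            GetResponse(context.Background())
        if err != nil {
            log.Println(err)
            return
        }
        fmt.Println(resp.Output)
    }(p)
}
wg.Wait()
```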

  • Tools Guide: Learn tool calling patterns.
  • Providers Overview: Compare all available providers.