# Ollama Local Model Integration
The Klira AI SDK provides seamless integration with Ollama for running local LLMs, automatically adding observability, guardrails, and policy enforcement to your locally hosted AI applications.
## Quick Start
```python
import os

import ollama

from klira.sdk import Klira
from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Initialize Klira AI SDK
klira = Klira.init(
    app_name="Ollama-App",
    api_key=os.getenv("KLIRA_API_KEY"),
    enabled=True
)

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_chat", organization_id="acme", project_id="local_ai")
@guardrails(enforcement_level="strict")
def ollama_completion(user_message: str, model: str = "llama2") -> str:
    """Create an Ollama completion with monitoring and governance."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": user_message}]
    )
    return response['message']['content']

# Usage
result = ollama_completion("Explain quantum computing in simple terms")
print(result)
```

## Installation and Setup
### Installing Ollama
```bash
# Windows (PowerShell as Administrator)
# Download from https://ollama.ai and run installer

# Linux
curl -fsSL https://ollama.ai/install.sh | sh

# macOS
brew install ollama
```

### Installing Python Client
```bash
pip install ollama
```
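To confirm that the client can reach the local server, a quick sanity check (assuming the Ollama server is already running on its default port) is to list the installed models from Python:

```python
import ollama

# Lists the models installed locally; fails if the Ollama server
# is not reachable on the default port (11434).
print(ollama.list())
```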
### Pulling Models

```bash
# Pull popular models
ollama pull llama2
ollama pull codellama
ollama pull mistral
ollama pull phi

# List available models
ollama list
```

## Supported Models
```python
import ollama

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_multi_model")
@guardrails(enforcement_level="moderate")
def ollama_multi_model_completion(message: str, use_case: str = "general") -> dict:
    """Ollama completion with model selection based on use case."""
    models = {
        "general": "llama2",          # General purpose conversations
        "code": "codellama",          # Code generation and analysis
        "math": "mistral",            # Mathematical reasoning
        "small": "phi",               # Lightweight, fast responses
        "creative": "llama2:13b",     # Creative writing (if available)
        "instruct": "llama2:7b-chat"  # Instruction following
    }

    selected_model = models.get(use_case, models["general"])

    response = ollama.chat(
        model=selected_model,
        messages=[{"role": "user", "content": message}]
    )

    return {
        "content": response['message']['content'],
        "model": selected_model,
        "use_case": use_case
    }
```
## Configuration

```python
import ollama

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="configured_ollama")
@guardrails(enforcement_level="moderate")
def configured_ollama_completion(
    message: str,
    model: str = "llama2",
    temperature: float = 0.7,
    max_tokens: int = 500
) -> str:
    """Ollama completion with custom configuration."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        options={
            "temperature": temperature,
            "num_predict": max_tokens,
            "top_p": 0.9,
            "top_k": 40
        }
    )

    return response['message']['content']
```
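Lower temperatures make output more deterministic, which suits factual or extraction-style prompts, while higher values encourage variety. An illustrative pair of calls to the workflow above:

```python
# Deterministic, short answer for a factual prompt
summary = configured_ollama_completion(
    "Summarize the water cycle in two sentences",
    temperature=0.2,
    max_tokens=120
)

# More exploratory output for a creative prompt
poem = configured_ollama_completion(
    "Write a short poem about distributed systems",
    temperature=0.9,
    max_tokens=300
)
```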
## Streaming

```python
import ollama

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_streaming")
@guardrails(enforcement_level="moderate")
def ollama_streaming_completion(message: str, model: str = "llama2") -> str:
    """Ollama completion with streaming response."""
    response_chunks = []

    stream = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        stream=True
    )

    for chunk in stream:
        content = chunk['message']['content']
        response_chunks.append(content)
        print(content, end="", flush=True)

    print()
    return "".join(response_chunks)
```
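The workflow above prints tokens as they arrive and returns the full text. If callers should consume the tokens themselves (for example, to forward them to a client connection), the stream can be exposed as a plain generator instead. The sketch below uses a hypothetical `ollama_stream_tokens` helper and omits the Klira decorators, since decorator support for generator functions may depend on the SDK version:

```python
from typing import Iterator

import ollama

def ollama_stream_tokens(message: str, model: str = "llama2") -> Iterator[str]:
    """Yield response tokens from Ollama as they arrive."""
    stream = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        stream=True
    )
    for chunk in stream:
        yield chunk['message']['content']

# Example: accumulate the stream into a single string
full_text = "".join(ollama_stream_tokens("Name three uses of local LLMs"))
```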
## Error Handling

```python
import time

import ollama
from ollama import ResponseError

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="robust_ollama")
@guardrails(enforcement_level="moderate")
def robust_ollama_completion(message: str, model: str = "llama2", max_retries: int = 3) -> str:
    """Ollama completion with robust error handling."""
    for attempt in range(max_retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": message}]
            )
            return response['message']['content']

        except ResponseError as e:
            # Pull the model on demand if it is not available locally
            if "model not found" in str(e).lower():
                try:
                    print(f"Pulling model: {model}")
                    ollama.pull(model)
                    continue
                except Exception as pull_error:
                    raise Exception(f"Failed to pull model {model}: {pull_error}")

            # Exponential backoff for other response errors
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt
                time.sleep(wait_time)
                continue
            raise

        except ConnectionError:
            if attempt < max_retries - 1:
                time.sleep(2)
                continue
            raise Exception("Could not connect to Ollama server. Is it running?")
```
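The first call for a missing model triggers a pull, which can take minutes for larger models, so it can help to time calls when diagnosing slow responses. An illustrative usage of the workflow above:

```python
import time

start = time.monotonic()
answer = robust_ollama_completion("What is retrieval-augmented generation?", model="mistral")
print(f"Answered in {time.monotonic() - start:.1f}s")
print(answer)
```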
## Model Management

```python
import ollama

from klira.sdk.decorators import workflow
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_model_management")
def manage_ollama_models() -> dict:
    """Manage Ollama models programmatically."""
    models = ollama.list()

    def model_exists(model_name: str) -> bool:
        return any(model['name'].startswith(model_name) for model in models['models'])

    def ensure_model(model_name: str) -> bool:
        if not model_exists(model_name):
            try:
                ollama.pull(model_name)
                return True
            except Exception as e:
                print(f"Failed to pull model {model_name}: {e}")
                return False
        return True

    required_models = ["llama2", "codellama"]
    results = {}

    for model in required_models:
        results[model] = ensure_model(model)

    return {
        "available_models": [model['name'] for model in models['models']],
        "pull_results": results
    }
```
## Best Practices

### Resource Management
```python
import psutil

import ollama

from klira.sdk.decorators import workflow
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="resource_aware_ollama")
def resource_aware_ollama_completion(message: str) -> dict:
    """Ollama completion with resource monitoring."""
    cpu_percent = psutil.cpu_percent(interval=1)
    memory_percent = psutil.virtual_memory().percent

    # Skip the request when the host is already under heavy load
    if cpu_percent > 90 or memory_percent > 85:
        return {
            "success": False,
            "error": "system_overloaded",
            "message": f"System resources too high: CPU {cpu_percent}%, Memory {memory_percent}%"
        }

    response = ollama.chat(
        model="llama2",
        messages=[{"role": "user", "content": message}]
    )

    return {
        "success": True,
        "content": response['message']['content']
    }
```
### Server Health Check

```python
import requests

def check_ollama_server() -> bool:
    """Check if Ollama server is running."""
    try:
        response = requests.get("http://localhost:11434/api/version", timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False
```
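Combining the health check with a workflow call surfaces a clear error up front instead of a connection failure mid-request. If the check fails, start the server (for example with `ollama serve`) and retry. A minimal sketch reusing `check_ollama_server` and the Quick Start's `ollama_completion`:

```python
if not check_ollama_server():
    # Start the server first, e.g. by running `ollama serve` in another terminal
    raise RuntimeError("Ollama server is not reachable at http://localhost:11434")

print(ollama_completion("Give me one tip for writing good commit messages"))
```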
## Related Documentation

- OpenAI Integration - OpenAI API integration
- Anthropic Integration - Claude API integration
- Custom LLM Providers - Adding new LLM support