# Ollama Local Model Integration
The Klira AI SDK provides seamless integration with Ollama for running local LLMs, automatically adding observability, guardrails, and policy enforcement to your locally hosted AI applications.
## Quick Start
```python
import os

import ollama

from klira.sdk import Klira
from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Initialize Klira AI SDK
klira = Klira.init(
    app_name="Ollama-App",
    api_key=os.getenv("KLIRA_API_KEY"),
    enabled=True
)

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_chat", organization_id="acme", project_id="local_ai")
@guardrails(enforcement_level="strict")
def ollama_completion(user_message: str, model: str = "llama2") -> str:
    """Create an Ollama completion with monitoring and governance."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": user_message}]
    )
    return response['message']['content']

# Usage
result = ollama_completion("Explain quantum computing in simple terms")
print(result)
```

## Installation and Setup
### Installing Ollama
```bash
# Windows (PowerShell as Administrator)
# Download from https://ollama.ai and run installer

# Linux
curl -fsSL https://ollama.ai/install.sh | sh

# macOS
brew install ollama
```

### Installing Python Client
```bash
pip install ollama
```
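To confirm that the client can reach the local server, a quick sanity check (assuming the Ollama server is already running on its default port) is to list the installed models from Python:

```python
import ollama

# Lists the models installed locally; fails if the Ollama server
# is not reachable on the default port (11434).
print(ollama.list())
```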
### Pulling Models

```bash
# Pull popular models
ollama pull llama2
ollama pull codellama
ollama pull mistral
ollama pull phi

# List available models
ollama list
```

## Supported Models
```python
import ollama

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_multi_model")
@guardrails(enforcement_level="moderate")
def ollama_multi_model_completion(message: str, use_case: str = "general") -> dict:
    """Ollama completion with model selection based on use case."""
    models = {
        "general": "llama2",          # General purpose conversations
        "code": "codellama",          # Code generation and analysis
        "math": "mistral",            # Mathematical reasoning
        "small": "phi",               # Lightweight, fast responses
        "creative": "llama2:13b",     # Creative writing (if available)
        "instruct": "llama2:7b-chat"  # Instruction following
    }

    selected_model = models.get(use_case, models["general"])

    response = ollama.chat(
        model=selected_model,
        messages=[{"role": "user", "content": message}]
    )

    return {
        "content": response['message']['content'],
        "model": selected_model,
        "use_case": use_case
    }
```
## Configuration

```python
import ollama

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="configured_ollama")
@guardrails(enforcement_level="moderate")
def configured_ollama_completion(
    message: str,
    model: str = "llama2",
    temperature: float = 0.7,
    max_tokens: int = 500
) -> str:
    """Ollama completion with custom configuration."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        options={
            "temperature": temperature,
            "num_predict": max_tokens,
            "top_p": 0.9,
            "top_k": 40
        }
    )

    return response['message']['content']
```
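Lower temperatures make output more deterministic, which suits factual or extraction-style prompts, while higher values encourage variety. An illustrative pair of calls to the workflow above:

```python
# Deterministic, short answer for a factual prompt
summary = configured_ollama_completion(
    "Summarize the water cycle in two sentences",
    temperature=0.2,
    max_tokens=120
)

# More exploratory output for a creative prompt
poem = configured_ollama_completion(
    "Write a short poem about distributed systems",
    temperature=0.9,
    max_tokens=300
)
```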
## Streaming

```python
import ollama

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_streaming")
@guardrails(enforcement_level="moderate")
def ollama_streaming_completion(message: str, model: str = "llama2") -> str:
    """Ollama completion with streaming response."""
    response_chunks = []

    stream = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        stream=True
    )

    for chunk in stream:
        content = chunk['message']['content']
        response_chunks.append(content)
        print(content, end="", flush=True)

    print()
    return "".join(response_chunks)
```
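The workflow above prints tokens as they arrive and returns the full text. If callers should consume the tokens themselves (for example, to forward them to a client connection), the stream can be exposed as a plain generator instead. The sketch below uses a hypothetical `ollama_stream_tokens` helper and omits the Klira decorators, since decorator support for generator functions may depend on the SDK version:

```python
from typing import Iterator

import ollama

def ollama_stream_tokens(message: str, model: str = "llama2") -> Iterator[str]:
    """Yield response tokens from Ollama as they arrive."""
    stream = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        stream=True
    )
    for chunk in stream:
        yield chunk['message']['content']

# Example: accumulate the stream into a single string
full_text = "".join(ollama_stream_tokens("Name three uses of local LLMs"))
```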
## Error Handling

```python
import time

import ollama
from ollama import ResponseError

from klira.sdk.decorators import workflow, guardrails
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="robust_ollama")
@guardrails(enforcement_level="moderate")
def robust_ollama_completion(message: str, model: str = "llama2", max_retries: int = 3) -> str:
    """Ollama completion with robust error handling."""
    for attempt in range(max_retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": message}]
            )
            return response['message']['content']

        except ResponseError as e:
            # Pull the model on demand if it is not available locally
            if "model not found" in str(e).lower():
                try:
                    print(f"Pulling model: {model}")
                    ollama.pull(model)
                    continue
                except Exception as pull_error:
                    raise Exception(f"Failed to pull model {model}: {pull_error}")

            # Exponential backoff for other response errors
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt
                time.sleep(wait_time)
                continue
            raise

        except ConnectionError:
            if attempt < max_retries - 1:
                time.sleep(2)
                continue
            raise Exception("Could not connect to Ollama server. Is it running?")
```
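The first call for a missing model triggers a pull, which can take minutes for larger models, so it can help to time calls when diagnosing slow responses. An illustrative usage of the workflow above:

```python
import time

start = time.monotonic()
answer = robust_ollama_completion("What is retrieval-augmented generation?", model="mistral")
print(f"Answered in {time.monotonic() - start:.1f}s")
print(answer)
```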
## Model Management

```python
import ollama

from klira.sdk.decorators import workflow
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="ollama_model_management")
def manage_ollama_models() -> dict:
    """Manage Ollama models programmatically."""
    models = ollama.list()

    def model_exists(model_name: str) -> bool:
        return any(model['name'].startswith(model_name) for model in models['models'])

    def ensure_model(model_name: str) -> bool:
        if not model_exists(model_name):
            try:
                ollama.pull(model_name)
                return True
            except Exception as e:
                print(f"Failed to pull model {model_name}: {e}")
                return False
        return True

    required_models = ["llama2", "codellama"]
    results = {}

    for model in required_models:
        results[model] = ensure_model(model)

    return {
        "available_models": [model['name'] for model in models['models']],
        "pull_results": results
    }
```
## Best Practices

### Resource Management
```python
import psutil

import ollama

from klira.sdk.decorators import workflow
from klira.sdk.utils.context import set_hierarchy_context

# Set user context
set_hierarchy_context(user_id="user_123")

@workflow(name="resource_aware_ollama")
def resource_aware_ollama_completion(message: str) -> dict:
    """Ollama completion with resource monitoring."""
    cpu_percent = psutil.cpu_percent(interval=1)
    memory_percent = psutil.virtual_memory().percent

    # Skip the request when the host is already under heavy load
    if cpu_percent > 90 or memory_percent > 85:
        return {
            "success": False,
            "error": "system_overloaded",
            "message": f"System resources too high: CPU {cpu_percent}%, Memory {memory_percent}%"
        }

    response = ollama.chat(
        model="llama2",
        messages=[{"role": "user", "content": message}]
    )

    return {
        "success": True,
        "content": response['message']['content']
    }
```
### Server Health Check

```python
import requests

def check_ollama_server() -> bool:
    """Check if Ollama server is running."""
    try:
        response = requests.get("http://localhost:11434/api/version", timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False
```
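Combining the health check with a workflow call surfaces a clear error up front instead of a connection failure mid-request. If the check fails, start the server (for example with `ollama serve`) and retry. A minimal sketch reusing `check_ollama_server` and the Quick Start's `ollama_completion`:

```python
if not check_ollama_server():
    # Start the server first, e.g. by running `ollama serve` in another terminal
    raise RuntimeError("Ollama server is not reachable at http://localhost:11434")

print(ollama_completion("Give me one tip for writing good commit messages"))
```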
## Related Documentation

- OpenAI Integration - OpenAI API integration
- Anthropic Integration - Claude API integration
- Custom LLM Providers - Adding new LLM support