Initial commit: Ollama MCP server

MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.
Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
Deployed to k8s ai-inference namespace via ArgoCD.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
commit 139a038505
Date: 2026-03-21 17:33:56 +00:00
6 changed files with 548 additions and 0 deletions
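
For reference, a minimal client-side sketch of exercising the deployed server over SSE. This is not part of the commit: it assumes the MCP Python SDK client API (sse_client plus ClientSession) and reuses the MetalLB address pinned in k8s/deployment.yaml (192.168.87.29:8090); adjust the URL for your environment.

# smoke_test.py -- illustrative sketch only; assumes the mcp SDK's sse_client/ClientSession
# client API and the LoadBalancer IP from k8s/deployment.yaml.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client

SERVER_URL = "http://192.168.87.29:8090/sse"  # MetalLB address from the Service manifest

async def main() -> None:
    async with sse_client(SERVER_URL) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print("tools:", [t.name for t in tools.tools])
            # Offload a simple code task to a local model instead of Claude
            result = await session.call_tool(
                "query_local_model",
                {"prompt": "Write a regex matching ISO-8601 dates.", "task_type": "code"},
            )
            print(result.content[0].text)

if __name__ == "__main__":
    asyncio.run(main())

The same session can call review_code, summarize, generate_boilerplate, or list_models with the input schemas defined in src/server.py.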


@@ -0,0 +1,65 @@
name: Build and Deploy
on:
push:
branches:
- main
pull_request:
branches:
- main
jobs:
build-push:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Build and push Docker image
run: |
echo "${{ secrets.DOCKER_PASSWORD }}" | docker login registry.storedbox.net \
-u ${{ secrets.DOCKER_USER }} --password-stdin
docker build -t registry.storedbox.net/ollama-mcp:${{ github.sha }} .
docker tag registry.storedbox.net/ollama-mcp:${{ github.sha }} \
registry.storedbox.net/ollama-mcp:latest
docker push registry.storedbox.net/ollama-mcp:${{ github.sha }}
docker push registry.storedbox.net/ollama-mcp:latest
deploy:
runs-on: ubuntu-latest
needs: build-push
if: github.ref == 'refs/heads/main'
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Set up kubectl
uses: azure/setup-kubectl@v3
- name: Configure kubectl
run: |
echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig
# an export in one step does not carry over to later steps; persist it via GITHUB_ENV
echo "KUBECONFIG=$PWD/kubeconfig" >> "$GITHUB_ENV"
- name: Apply k8s manifests
run: |
kubectl apply -f k8s/deployment.yaml --kubeconfig=kubeconfig
- name: Rollout restart to pull latest image
run: |
kubectl rollout restart deployment/ollama-mcp \
-n ai-inference --kubeconfig=kubeconfig
kubectl rollout status deployment/ollama-mcp \
-n ai-inference --kubeconfig=kubeconfig --timeout=120s
- name: Health check
run: |
sleep 10
MCP_IP=$(kubectl get svc ollama-mcp -n ai-inference \
-o jsonpath='{.status.loadBalancer.ingress[0].ip}' \
--kubeconfig=kubeconfig)
curl -f http://${MCP_IP}:8090/health || exit 1
echo "MCP server healthy at http://${MCP_IP}:8090"

Dockerfile

@@ -0,0 +1,20 @@
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src/ ./src/
ENV PYTHONUNBUFFERED=1 \
PORT=8090 \
LITELLM_BASE_URL=http://litellm.ai-inference.svc:4000 \
REQUEST_TIMEOUT=120
EXPOSE 8090
HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8090/health')"
CMD ["python", "src/server.py"]

k8s/argocd-app.yaml

@@ -0,0 +1,43 @@
---
# ArgoCD Application - deploy this once to bootstrap:
# kubectl apply -f k8s/argocd-app.yaml
#
# Prerequisite: add the repo to ArgoCD first:
# argocd repo add https://repo.adservio.us/ai_approver/ollama-mcp.git \
# --username <gitea-user> --password <gitea-token>
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: ollama-mcp
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://repo.adservio.us/ai_approver/ollama-mcp.git
targetRevision: main
path: k8s
directory:
exclude: argocd-app.yaml
destination:
server: https://kubernetes.default.svc
namespace: ai-inference
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
syncOptions:
- CreateNamespace=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
ignoreDifferences:
- group: apps
kind: Deployment
jsonPointers:
- /spec/replicas

k8s/deployment.yaml

@@ -0,0 +1,95 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ollama-mcp
namespace: ai-inference
labels:
app: ollama-mcp
annotations:
argocd.argoproj.io/sync-wave: "10"
spec:
replicas: 1
selector:
matchLabels:
app: ollama-mcp
template:
metadata:
labels:
app: ollama-mcp
spec:
containers:
- name: ollama-mcp
image: registry.storedbox.net/ollama-mcp:latest
ports:
- containerPort: 8090
name: http
protocol: TCP
env:
- name: LITELLM_BASE_URL
value: "http://litellm.ai-inference.svc:4000"
- name: LITELLM_API_KEY
valueFrom:
secretKeyRef:
name: ollama-mcp-secrets
key: LITELLM_API_KEY
optional: true
- name: PORT
value: "8090"
- name: REQUEST_TIMEOUT
value: "120"
livenessProbe:
httpGet:
path: /health
port: 8090
initialDelaySeconds: 15
periodSeconds: 30
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /health
port: 8090
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- k3s-control-2
- k3s-worker-3
- k3s-worker-4
---
apiVersion: v1
kind: Service
metadata:
name: ollama-mcp
namespace: ai-inference
labels:
app: ollama-mcp
annotations:
metallb.universe.tf/loadBalancerIPs: "192.168.87.29"
spec:
type: LoadBalancer
ports:
- port: 8090
targetPort: 8090
protocol: TCP
name: http
selector:
app: ollama-mcp
---
# Create the LiteLLM API key secret before deploying:
# kubectl create secret generic ollama-mcp-secrets -n ai-inference \
#   --from-literal=LITELLM_API_KEY=<litellm-api-key>   # never commit a real key

requirements.txt

@@ -0,0 +1,4 @@
mcp>=1.0.0
httpx>=0.27.0
starlette>=0.41.0
uvicorn[standard]>=0.32.0

src/server.py

@@ -0,0 +1,321 @@
"""
Ollama MCP Server
Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
Runs as an HTTP/SSE MCP server in Kubernetes.
"""
import os
import httpx
import logging
from mcp.server import Server
from mcp.server.sse import SseServerTransport
from mcp.types import Tool, TextContent
from starlette.applications import Starlette
from starlette.routing import Route, Mount
from starlette.requests import Request
from starlette.responses import JSONResponse
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))
# Model aliases exposed to Claude Code
MODELS = {
"deepseek-coder": "local/deepseek-coder", # deepseek-coder-v2:16b - code tasks
"llama3": "local/llama3", # llama3.1:8b - general tasks
"codellama": "local/codellama", # codellama:13b - code generation
"qwen-coder": "local/qwen-coder", # qwen2.5-coder:14b - code tasks
}
DEFAULT_CODE_MODEL = "local/qwen-coder"
DEFAULT_GENERAL_MODEL = "local/llama3"
app = Server("ollama-mcp")
async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
"""Call LiteLLM proxy with the given model and messages."""
headers = {"Content-Type": "application/json"}
if LITELLM_API_KEY:
headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
payload = {
"model": model,
"messages": messages,
"max_tokens": max_tokens,
"stream": False,
}
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
response = await client.post(
f"{LITELLM_BASE_URL}/v1/chat/completions",
headers=headers,
json=payload,
)
response.raise_for_status()
data = response.json()
return data["choices"][0]["message"]["content"]
@app.list_tools()
async def list_tools() -> list[Tool]:
return [
Tool(
name="query_local_model",
description=(
"Send a prompt to a local Ollama model via LiteLLM. "
"Use for tasks that don't require Claude's reasoning: summarization, "
"formatting, simple code generation, boilerplate, regex, quick lookups. "
f"Available models: {', '.join(MODELS.keys())}. "
"Defaults to qwen-coder for code tasks, llama3 for general."
),
inputSchema={
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The prompt to send to the model.",
},
"model": {
"type": "string",
"enum": list(MODELS.keys()),
"description": "Model to use. Omit to auto-select based on task type.",
},
"task_type": {
"type": "string",
"enum": ["code", "general"],
"description": "Task type for auto model selection when model is not specified.",
"default": "general",
},
"system_prompt": {
"type": "string",
"description": "Optional system prompt to guide model behavior.",
},
"max_tokens": {
"type": "integer",
"description": "Maximum tokens in response (default: 2048).",
"default": 2048,
},
},
"required": ["prompt"],
},
),
Tool(
name="review_code",
description=(
"Review code for bugs, style issues, and improvements using a local code model. "
"Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic."
),
inputSchema={
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "The code to review.",
},
"language": {
"type": "string",
"description": "Programming language (e.g. python, go, typescript).",
},
"focus": {
"type": "string",
"description": "Optional focus area: bugs, style, security, performance, tests.",
},
},
"required": ["code"],
},
),
Tool(
name="summarize",
description=(
"Summarize text, logs, or documentation using a local model. "
"Use for large log files, changelogs, or documentation that doesn't need Claude's analysis."
),
inputSchema={
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "Text to summarize.",
},
"format": {
"type": "string",
"enum": ["bullet_points", "paragraph", "key_facts"],
"description": "Output format for the summary.",
"default": "bullet_points",
},
"max_length": {
"type": "string",
"enum": ["short", "medium", "detailed"],
"description": "Desired summary length.",
"default": "medium",
},
},
"required": ["text"],
},
),
Tool(
name="generate_boilerplate",
description=(
"Generate boilerplate code, configs, or scaffolding using a local code model. "
"Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc."
),
inputSchema={
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').",
},
"context": {
"type": "string",
"description": "Optional additional context or constraints.",
},
},
"required": ["description"],
},
),
Tool(
name="list_models",
description="List available local Ollama models and their use cases.",
inputSchema={
"type": "object",
"properties": {},
},
),
]
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
try:
if name == "list_models":
model_info = "\n".join([
"Available local models (routed via LiteLLM):",
"- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis",
"- llama3: llama3.1:8b — general tasks, summarization, Q&A",
"- codellama: codellama:13b — code generation, completion",
"- qwen-coder: qwen2.5-coder:14b — code tasks, default for code",
"",
f"LiteLLM endpoint: {LITELLM_BASE_URL}",
])
return [TextContent(type="text", text=model_info)]
elif name == "query_local_model":
prompt = arguments["prompt"]
task_type = arguments.get("task_type", "general")
system_prompt = arguments.get("system_prompt")
max_tokens = arguments.get("max_tokens", 2048)
if "model" in arguments:
model = MODELS[arguments["model"]]
elif task_type == "code":
model = DEFAULT_CODE_MODEL
else:
model = DEFAULT_GENERAL_MODEL
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
result = await call_litellm(model, messages, max_tokens)
return [TextContent(type="text", text=result)]
elif name == "review_code":
code = arguments["code"]
language = arguments.get("language", "")
focus = arguments.get("focus", "general code quality")
lang_str = f" {language}" if language else ""
system = (
f"You are an expert{lang_str} code reviewer. "
"Be concise and actionable. Flag actual issues, not style preferences unless asked. "
"Format: list issues with severity (critical/warning/info), then suggestions."
)
prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```"
result = await call_litellm("local/deepseek-coder", [
{"role": "system", "content": system},
{"role": "user", "content": prompt},
])
return [TextContent(type="text", text=result)]
elif name == "summarize":
text = arguments["text"]
fmt = arguments.get("format", "bullet_points")
length = arguments.get("max_length", "medium")
length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"}
fmt_guide = {
"bullet_points": "Use bullet points.",
"paragraph": "Write in paragraph form.",
"key_facts": "Extract only key facts and numbers.",
}
system = "You are a concise technical summarizer. Extract the most important information."
prompt = (
f"Summarize the following text. {fmt_guide[fmt]} "
f"Length: {length_guide[length]}.\n\n{text}"
)
result = await call_litellm(DEFAULT_GENERAL_MODEL, [
{"role": "system", "content": system},
{"role": "user", "content": prompt},
])
return [TextContent(type="text", text=result)]
elif name == "generate_boilerplate":
description = arguments["description"]
context = arguments.get("context", "")
system = (
"You are an expert DevOps and software engineer. "
"Generate clean, production-ready boilerplate. "
"Output only the file content with no explanation unless asked."
)
prompt = f"Generate: {description}"
if context:
prompt += f"\n\nContext/constraints:\n{context}"
result = await call_litellm(DEFAULT_CODE_MODEL, [
{"role": "system", "content": system},
{"role": "user", "content": prompt},
])
return [TextContent(type="text", text=result)]
else:
return [TextContent(type="text", text=f"Unknown tool: {name}")]
except httpx.HTTPStatusError as e:
logger.error("LiteLLM HTTP error: %s", e)
return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")]
except Exception as e:
logger.error("Tool error: %s", e)
return [TextContent(type="text", text=f"Error: {str(e)}")]
def create_starlette_app() -> Starlette:
sse = SseServerTransport("/messages/")
async def handle_sse(request: Request):
async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
await app.run(streams[0], streams[1], app.create_initialization_options())
    async def health(request: Request) -> JSONResponse:
        return JSONResponse({"status": "ok"})
    return Starlette(
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/messages/", app=sse.handle_post_message),
            Route("/health", endpoint=health),
        ]
    )
if __name__ == "__main__":
import uvicorn
port = int(os.environ.get("PORT", "8090"))
logger.info("Starting Ollama MCP server on port %d", port)
logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL)
uvicorn.run(create_starlette_app(), host="0.0.0.0", port=port)
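
To debug the LiteLLM hop independently of MCP, the request that call_litellm builds can be reproduced directly with httpx. A minimal sketch, assuming the proxy is reachable at the default LITELLM_BASE_URL (in-cluster, or via a kubectl port-forward) and that the local/llama3 alias from MODELS is configured on the proxy.

# litellm_probe.py -- illustrative sketch mirroring call_litellm(); assumes the proxy URL
# and the "local/llama3" alias are valid in your environment.
import os

import httpx

LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

def probe() -> str:
    headers = {"Content-Type": "application/json"}
    if LITELLM_API_KEY:
        headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
    payload = {
        "model": "local/llama3",
        "messages": [{"role": "user", "content": "Reply with the single word: ok"}],
        "max_tokens": 16,
        "stream": False,
    }
    resp = httpx.post(
        f"{LITELLM_BASE_URL}/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(probe())

A 200 response with a sensible completion confirms the proxy and model routing are healthy before layering the MCP transport on top.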