Initial commit: Ollama MCP server
Some checks failed
Build and Deploy / build-push (push) Has been cancelled
Build and Deploy / deploy (push) Has been cancelled

MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.
Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
Deployed to k8s ai-inference namespace via ArgoCD.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 17:33:56 +00:00
commit 139a038505
6 changed files with 548 additions and 0 deletions

321
src/server.py Normal file
View File

@@ -0,0 +1,321 @@
"""
Ollama MCP Server
Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
Runs as an HTTP/SSE MCP server in Kubernetes.
"""
import os
import httpx
import logging
from mcp.server import Server
from mcp.server.sse import SseServerTransport
from mcp.types import Tool, TextContent
from starlette.applications import Starlette
from starlette.routing import Route, Mount
from starlette.requests import Request
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# LiteLLM proxy endpoint; default targets the in-cluster service DNS name.
LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
# Optional bearer token for the proxy; empty string disables the Authorization header.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
# Per-request timeout in seconds for calls to LiteLLM.
REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))
# Model aliases exposed to Claude Code
# Keys are the short names surfaced in tool schemas; values are LiteLLM route names.
MODELS = {
    "deepseek-coder": "local/deepseek-coder",  # deepseek-coder-v2:16b - code tasks
    "llama3": "local/llama3",                  # llama3.1:8b - general tasks
    "codellama": "local/codellama",            # codellama:13b - code generation
    "qwen-coder": "local/qwen-coder",          # qwen2.5-coder:14b - code tasks
}
# Fallback models used when the caller does not pick one explicitly.
DEFAULT_CODE_MODEL = "local/qwen-coder"
DEFAULT_GENERAL_MODEL = "local/llama3"
# MCP server instance; tool handlers are registered via decorators below.
app = Server("ollama-mcp")
async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
    """Send a chat-completion request to the LiteLLM proxy and return the reply text.

    Args:
        model: LiteLLM route name (e.g. "local/llama3").
        messages: OpenAI-style chat messages (role/content dicts).
        max_tokens: Upper bound on the completion length.

    Returns:
        The content string of the first completion choice.

    Raises:
        httpx.HTTPStatusError: If the proxy returns a non-2xx response.
    """
    request_headers = {"Content-Type": "application/json"}
    # Attach auth only when a key is configured; the proxy may run keyless in-cluster.
    if LITELLM_API_KEY:
        request_headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
    body = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "stream": False,
    }
    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as http:
        resp = await http.post(
            f"{LITELLM_BASE_URL}/v1/chat/completions",
            headers=request_headers,
            json=body,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
@app.list_tools()
async def list_tools() -> list[Tool]:
    """Advertise the MCP tools this server exposes to connected clients.

    Returns the static tool catalog: schemas describe each tool's inputs so
    Claude Code can validate arguments before calling `call_tool`.
    """
    return [
        Tool(
            name="query_local_model",
            description=(
                "Send a prompt to a local Ollama model via LiteLLM. "
                "Use for tasks that don't require Claude's reasoning: summarization, "
                "formatting, simple code generation, boilerplate, regex, quick lookups. "
                f"Available models: {', '.join(MODELS.keys())}. "
                "Defaults to qwen-coder for code tasks, llama3 for general."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "prompt": {
                        "type": "string",
                        "description": "The prompt to send to the model.",
                    },
                    # Enum restricts callers to the aliases declared in MODELS.
                    "model": {
                        "type": "string",
                        "enum": list(MODELS.keys()),
                        "description": "Model to use. Omit to auto-select based on task type.",
                    },
                    "task_type": {
                        "type": "string",
                        "enum": ["code", "general"],
                        "description": "Task type for auto model selection when model is not specified.",
                        "default": "general",
                    },
                    "system_prompt": {
                        "type": "string",
                        "description": "Optional system prompt to guide model behavior.",
                    },
                    "max_tokens": {
                        "type": "integer",
                        "description": "Maximum tokens in response (default: 2048).",
                        "default": 2048,
                    },
                },
                "required": ["prompt"],
            },
        ),
        Tool(
            name="review_code",
            description=(
                "Review code for bugs, style issues, and improvements using a local code model. "
                "Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "The code to review.",
                    },
                    "language": {
                        "type": "string",
                        "description": "Programming language (e.g. python, go, typescript).",
                    },
                    "focus": {
                        "type": "string",
                        "description": "Optional focus area: bugs, style, security, performance, tests.",
                    },
                },
                "required": ["code"],
            },
        ),
        Tool(
            name="summarize",
            description=(
                "Summarize text, logs, or documentation using a local model. "
                "Use for large log files, changelogs, or documentation that doesn't need Claude's analysis."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "Text to summarize.",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["bullet_points", "paragraph", "key_facts"],
                        "description": "Output format for the summary.",
                        "default": "bullet_points",
                    },
                    "max_length": {
                        "type": "string",
                        "enum": ["short", "medium", "detailed"],
                        "description": "Desired summary length.",
                        "default": "medium",
                    },
                },
                "required": ["text"],
            },
        ),
        Tool(
            name="generate_boilerplate",
            description=(
                "Generate boilerplate code, configs, or scaffolding using a local code model. "
                "Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "description": {
                        "type": "string",
                        "description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').",
                    },
                    "context": {
                        "type": "string",
                        "description": "Optional additional context or constraints.",
                    },
                },
                "required": ["description"],
            },
        ),
        # No inputs: returns the static model catalog.
        Tool(
            name="list_models",
            description="List available local Ollama models and their use cases.",
            inputSchema={
                "type": "object",
                "properties": {},
            },
        ),
    ]
def _format_model_listing() -> str:
    """Return the static human-readable list of model aliases and use cases."""
    return "\n".join([
        "Available local models (routed via LiteLLM):",
        "- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis",
        "- llama3: llama3.1:8b — general tasks, summarization, Q&A",
        "- codellama: codellama:13b — code generation, completion",
        "- qwen-coder: qwen2.5-coder:14b — code tasks, default for code",
        "",
        f"LiteLLM endpoint: {LITELLM_BASE_URL}",
    ])
async def _query_local_model(arguments: dict) -> str:
    """Send a free-form prompt to an explicitly chosen or auto-selected model."""
    prompt = arguments["prompt"]
    task_type = arguments.get("task_type", "general")
    system_prompt = arguments.get("system_prompt")
    max_tokens = arguments.get("max_tokens", 2048)
    # An explicit model wins; otherwise fall back to a default by task type.
    if "model" in arguments:
        model = MODELS[arguments["model"]]
    elif task_type == "code":
        model = DEFAULT_CODE_MODEL
    else:
        model = DEFAULT_GENERAL_MODEL
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return await call_litellm(model, messages, max_tokens)
async def _review_code(arguments: dict) -> str:
    """Review code with the deepseek-coder model, optionally focused on one area."""
    code = arguments["code"]
    language = arguments.get("language", "")
    focus = arguments.get("focus", "general code quality")
    lang_str = f" {language}" if language else ""
    system = (
        f"You are an expert{lang_str} code reviewer. "
        "Be concise and actionable. Flag actual issues, not style preferences unless asked. "
        "Format: list issues with severity (critical/warning/info), then suggestions."
    )
    prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```"
    # Use the MODELS table instead of a hard-coded route so the alias map stays authoritative.
    return await call_litellm(MODELS["deepseek-coder"], [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ])
async def _summarize(arguments: dict) -> str:
    """Summarize text with the general model, honoring format/length hints."""
    text = arguments["text"]
    fmt = arguments.get("format", "bullet_points")
    length = arguments.get("max_length", "medium")
    length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"}
    fmt_guide = {
        "bullet_points": "Use bullet points.",
        "paragraph": "Write in paragraph form.",
        "key_facts": "Extract only key facts and numbers.",
    }
    system = "You are a concise technical summarizer. Extract the most important information."
    # .get with defaults: an out-of-enum value degrades gracefully instead of raising KeyError.
    prompt = (
        f"Summarize the following text. {fmt_guide.get(fmt, fmt_guide['bullet_points'])} "
        f"Length: {length_guide.get(length, length_guide['medium'])}.\n\n{text}"
    )
    return await call_litellm(DEFAULT_GENERAL_MODEL, [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ])
async def _generate_boilerplate(arguments: dict) -> str:
    """Generate boilerplate/scaffolding using the default code model."""
    description = arguments["description"]
    context = arguments.get("context", "")
    system = (
        "You are an expert DevOps and software engineer. "
        "Generate clean, production-ready boilerplate. "
        "Output only the file content with no explanation unless asked."
    )
    prompt = f"Generate: {description}"
    if context:
        prompt += f"\n\nContext/constraints:\n{context}"
    return await call_litellm(DEFAULT_CODE_MODEL, [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ])
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    """Dispatch an MCP tool invocation to its handler and wrap the result.

    Args:
        name: Tool name as advertised by `list_tools`.
        arguments: Tool arguments validated against the advertised schema.

    Returns:
        A single TextContent item with the model output, or an error message;
        all failures are reported as text rather than raised to the client.
    """
    try:
        if name == "list_models":
            result = _format_model_listing()
        elif name == "query_local_model":
            result = await _query_local_model(arguments)
        elif name == "review_code":
            result = await _review_code(arguments)
        elif name == "summarize":
            result = await _summarize(arguments)
        elif name == "generate_boilerplate":
            result = await _generate_boilerplate(arguments)
        else:
            return [TextContent(type="text", text=f"Unknown tool: {name}")]
        return [TextContent(type="text", text=result)]
    except httpx.HTTPStatusError as e:
        # logger.exception preserves the traceback (logger.error did not).
        logger.exception("LiteLLM HTTP error calling tool %s", name)
        return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")]
    except Exception as e:
        logger.exception("Tool %s failed", name)
        return [TextContent(type="text", text=f"Error: {str(e)}")]
def create_starlette_app() -> Starlette:
    """Build the Starlette ASGI app that exposes the MCP server over SSE.

    Routes:
        GET  /sse        — long-lived SSE stream carrying MCP protocol messages.
        POST /messages/  — client-to-server MCP messages (handled by the transport).
        GET  /health     — liveness probe for Kubernetes.
    """
    # Normal import instead of the previous inline __import__ hack; function-scope
    # keeps the module's top-level import block unchanged.
    from starlette.responses import JSONResponse

    sse = SseServerTransport("/messages/")

    async def handle_sse(request: Request):
        # request._send is private ASGI plumbing, but this is the standard
        # pattern for wiring SseServerTransport into Starlette.
        async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
            await app.run(streams[0], streams[1], app.create_initialization_options())

    async def health(request: Request) -> JSONResponse:
        # Proper async endpoint; the original built the response via a sync
        # lambda with a dynamic import.
        return JSONResponse({"status": "ok"})

    return Starlette(
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/messages/", app=sse.handle_post_message),
            Route("/health", endpoint=health),
        ]
    )
if __name__ == "__main__":
    # Local import: uvicorn is only needed when running as a standalone process.
    import uvicorn

    listen_port = int(os.environ.get("PORT", "8090"))
    logger.info("Starting Ollama MCP server on port %d", listen_port)
    logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL)
    # Bind all interfaces so the k8s Service can reach the pod.
    uvicorn.run(create_starlette_app(), host="0.0.0.0", port=listen_port)