Initial commit: Ollama MCP server
MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.

Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
Deployed to the k8s ai-inference namespace via ArgoCD.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
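As an illustration of the flow described above, here is a minimal client sketch that exercises the server over SSE using the same mcp SDK the server itself imports. The in-cluster Service URL (ollama-mcp.ai-inference.svc:8090) is an assumption for this example and is not defined anywhere in this commit:

# Hypothetical smoke test; the Service hostname below is an assumption.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client

SERVER_URL = "http://ollama-mcp.ai-inference.svc:8090/sse"  # assumed Service name and port

async def main() -> None:
    async with sse_client(SERVER_URL) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            # List the tools the server advertises, then call one of them.
            tools = await session.list_tools()
            print("tools:", [t.name for t in tools.tools])
            result = await session.call_tool(
                "query_local_model",
                {"prompt": "Write a regex that matches ISO-8601 dates.", "task_type": "code"},
            )
            print(result.content[0].text)

asyncio.run(main())

Claude Code would normally be pointed at the /sse endpoint through its MCP configuration rather than a hand-written client; the sketch above only shows the protocol round trip.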
src/server.py (new file, 321 lines)

"""
Ollama MCP Server
Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
Runs as an HTTP/SSE MCP server in Kubernetes.
"""

import os
import httpx
import logging
from mcp.server import Server
from mcp.server.sse import SseServerTransport
from mcp.types import Tool, TextContent
from starlette.applications import Starlette
from starlette.routing import Route, Mount
from starlette.requests import Request
from starlette.responses import JSONResponse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))

# Model aliases exposed to Claude Code
MODELS = {
    "deepseek-coder": "local/deepseek-coder",  # deepseek-coder-v2:16b - code tasks
    "llama3": "local/llama3",                  # llama3.1:8b - general tasks
    "codellama": "local/codellama",            # codellama:13b - code generation
    "qwen-coder": "local/qwen-coder",          # qwen2.5-coder:14b - code tasks
}

DEFAULT_CODE_MODEL = "local/qwen-coder"
DEFAULT_GENERAL_MODEL = "local/llama3"

app = Server("ollama-mcp")


async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
    """Call LiteLLM proxy with the given model and messages."""
    headers = {"Content-Type": "application/json"}
    if LITELLM_API_KEY:
        headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"

    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "stream": False,
    }

    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
        response = await client.post(
            f"{LITELLM_BASE_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
        )
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["message"]["content"]


@app.list_tools()
async def list_tools() -> list[Tool]:
    return [
        Tool(
            name="query_local_model",
            description=(
                "Send a prompt to a local Ollama model via LiteLLM. "
                "Use for tasks that don't require Claude's reasoning: summarization, "
                "formatting, simple code generation, boilerplate, regex, quick lookups. "
                f"Available models: {', '.join(MODELS.keys())}. "
                "Defaults to qwen-coder for code tasks, llama3 for general."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "prompt": {
                        "type": "string",
                        "description": "The prompt to send to the model.",
                    },
                    "model": {
                        "type": "string",
                        "enum": list(MODELS.keys()),
                        "description": "Model to use. Omit to auto-select based on task type.",
                    },
                    "task_type": {
                        "type": "string",
                        "enum": ["code", "general"],
                        "description": "Task type for auto model selection when model is not specified.",
                        "default": "general",
                    },
                    "system_prompt": {
                        "type": "string",
                        "description": "Optional system prompt to guide model behavior.",
                    },
                    "max_tokens": {
                        "type": "integer",
                        "description": "Maximum tokens in response (default: 2048).",
                        "default": 2048,
                    },
                },
                "required": ["prompt"],
            },
        ),
        Tool(
            name="review_code",
            description=(
                "Review code for bugs, style issues, and improvements using a local code model. "
                "Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "The code to review.",
                    },
                    "language": {
                        "type": "string",
                        "description": "Programming language (e.g. python, go, typescript).",
                    },
                    "focus": {
                        "type": "string",
                        "description": "Optional focus area: bugs, style, security, performance, tests.",
                    },
                },
                "required": ["code"],
            },
        ),
        Tool(
            name="summarize",
            description=(
                "Summarize text, logs, or documentation using a local model. "
                "Use for large log files, changelogs, or documentation that doesn't need Claude's analysis."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "Text to summarize.",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["bullet_points", "paragraph", "key_facts"],
                        "description": "Output format for the summary.",
                        "default": "bullet_points",
                    },
                    "max_length": {
                        "type": "string",
                        "enum": ["short", "medium", "detailed"],
                        "description": "Desired summary length.",
                        "default": "medium",
                    },
                },
                "required": ["text"],
            },
        ),
        Tool(
            name="generate_boilerplate",
            description=(
                "Generate boilerplate code, configs, or scaffolding using a local code model. "
                "Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "description": {
                        "type": "string",
                        "description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').",
                    },
                    "context": {
                        "type": "string",
                        "description": "Optional additional context or constraints.",
                    },
                },
                "required": ["description"],
            },
        ),
        Tool(
            name="list_models",
            description="List available local Ollama models and their use cases.",
            inputSchema={
                "type": "object",
                "properties": {},
            },
        ),
    ]


@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    try:
        if name == "list_models":
            model_info = "\n".join([
                "Available local models (routed via LiteLLM):",
                "- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis",
                "- llama3: llama3.1:8b — general tasks, summarization, Q&A",
                "- codellama: codellama:13b — code generation, completion",
                "- qwen-coder: qwen2.5-coder:14b — code tasks, default for code",
                "",
                f"LiteLLM endpoint: {LITELLM_BASE_URL}",
            ])
            return [TextContent(type="text", text=model_info)]

        elif name == "query_local_model":
            prompt = arguments["prompt"]
            task_type = arguments.get("task_type", "general")
            system_prompt = arguments.get("system_prompt")
            max_tokens = arguments.get("max_tokens", 2048)

            if "model" in arguments:
                model = MODELS[arguments["model"]]
            elif task_type == "code":
                model = DEFAULT_CODE_MODEL
            else:
                model = DEFAULT_GENERAL_MODEL

            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": prompt})

            result = await call_litellm(model, messages, max_tokens)
            return [TextContent(type="text", text=result)]

        elif name == "review_code":
            code = arguments["code"]
            language = arguments.get("language", "")
            focus = arguments.get("focus", "general code quality")

            lang_str = f" {language}" if language else ""
            system = (
                f"You are an expert{lang_str} code reviewer. "
                "Be concise and actionable. Flag actual issues, not style preferences unless asked. "
                "Format: list issues with severity (critical/warning/info), then suggestions."
            )
            prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```"

            result = await call_litellm("local/deepseek-coder", [
                {"role": "system", "content": system},
                {"role": "user", "content": prompt},
            ])
            return [TextContent(type="text", text=result)]

        elif name == "summarize":
            text = arguments["text"]
            fmt = arguments.get("format", "bullet_points")
            length = arguments.get("max_length", "medium")

            length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"}
            fmt_guide = {
                "bullet_points": "Use bullet points.",
                "paragraph": "Write in paragraph form.",
                "key_facts": "Extract only key facts and numbers.",
            }

            system = "You are a concise technical summarizer. Extract the most important information."
            prompt = (
                f"Summarize the following text. {fmt_guide[fmt]} "
                f"Length: {length_guide[length]}.\n\n{text}"
            )

            result = await call_litellm(DEFAULT_GENERAL_MODEL, [
                {"role": "system", "content": system},
                {"role": "user", "content": prompt},
            ])
            return [TextContent(type="text", text=result)]

        elif name == "generate_boilerplate":
            description = arguments["description"]
            context = arguments.get("context", "")

            system = (
                "You are an expert DevOps and software engineer. "
                "Generate clean, production-ready boilerplate. "
                "Output only the file content with no explanation unless asked."
            )
            prompt = f"Generate: {description}"
            if context:
                prompt += f"\n\nContext/constraints:\n{context}"

            result = await call_litellm(DEFAULT_CODE_MODEL, [
                {"role": "system", "content": system},
                {"role": "user", "content": prompt},
            ])
            return [TextContent(type="text", text=result)]

        else:
            return [TextContent(type="text", text=f"Unknown tool: {name}")]

    except httpx.HTTPStatusError as e:
        logger.error("LiteLLM HTTP error: %s", e)
        return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")]
    except Exception as e:
        logger.error("Tool error: %s", e)
        return [TextContent(type="text", text=f"Error: {str(e)}")]


def create_starlette_app() -> Starlette:
    sse = SseServerTransport("/messages/")

    async def handle_sse(request: Request):
        # request._send is the raw ASGI send callable required by the MCP SSE transport.
        async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
            await app.run(streams[0], streams[1], app.create_initialization_options())

    async def health(request: Request) -> JSONResponse:
        return JSONResponse({"status": "ok"})

    return Starlette(
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/messages/", app=sse.handle_post_message),
            Route("/health", endpoint=health),
        ]
    )


if __name__ == "__main__":
    import uvicorn

    port = int(os.environ.get("PORT", "8090"))
    logger.info("Starting Ollama MCP server on port %d", port)
    logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL)
    uvicorn.run(create_starlette_app(), host="0.0.0.0", port=port)
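For completeness, a hedged smoke-test sketch for the deployment wiring: it checks the server's /health route and then the LiteLLM proxy's OpenAI-compatible endpoint that call_litellm targets. The ollama-mcp Service hostname is an assumption for this example; the LiteLLM default mirrors LITELLM_BASE_URL above:

# Hypothetical in-cluster smoke test; hostnames and ports mirror the defaults in server.py.
import asyncio
import os

import httpx

MCP_HEALTH_URL = os.environ.get("MCP_HEALTH_URL", "http://ollama-mcp.ai-inference.svc:8090/health")  # assumed Service name
LITELLM_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")

async def main() -> None:
    async with httpx.AsyncClient(timeout=30) as client:
        # 1. Liveness of the MCP server itself.
        health = await client.get(MCP_HEALTH_URL)
        print("mcp health:", health.json())

        # 2. Same OpenAI-compatible call that call_litellm() makes.
        resp = await client.post(
            f"{LITELLM_URL}/v1/chat/completions",
            json={
                "model": "local/llama3",
                "messages": [{"role": "user", "content": "Say hello in one word."}],
                "max_tokens": 16,
            },
        )
        resp.raise_for_status()
        print(resp.json()["choices"][0]["message"]["content"])

asyncio.run(main())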