Initial commit: Ollama MCP server
Some checks failed
Build and Deploy / build-push (push) Has been cancelled
Build and Deploy / deploy (push) Has been cancelled

MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.
Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
Deployed to k8s ai-inference namespace via ArgoCD.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 17:33:56 +00:00
commit 139a038505
6 changed files with 548 additions and 0 deletions

321
src/server.py Normal file
View File

@@ -0,0 +1,321 @@
"""
Ollama MCP Server
Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
Runs as an HTTP/SSE MCP server in Kubernetes.
"""
import os
import httpx
import logging
from mcp.server import Server
from mcp.server.sse import SseServerTransport
from mcp.types import Tool, TextContent
from starlette.applications import Starlette
from starlette.routing import Route, Mount
from starlette.requests import Request
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# LiteLLM proxy endpoint; default targets the in-cluster service DNS name.
LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
# Optional bearer token for the proxy; empty string disables the Authorization header.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
# Per-request timeout in seconds for calls to LiteLLM.
REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))
# Model aliases exposed to Claude Code
# Keys are the short names surfaced in tool schemas; values are LiteLLM route names.
MODELS = {
    "deepseek-coder": "local/deepseek-coder",  # deepseek-coder-v2:16b - code tasks
    "llama3": "local/llama3",                  # llama3.1:8b - general tasks
    "codellama": "local/codellama",            # codellama:13b - code generation
    "qwen-coder": "local/qwen-coder",          # qwen2.5-coder:14b - code tasks
}
# Fallback models used when the caller does not pick one explicitly.
DEFAULT_CODE_MODEL = "local/qwen-coder"
DEFAULT_GENERAL_MODEL = "local/llama3"
# MCP server instance; tool handlers are registered via decorators below.
app = Server("ollama-mcp")
async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
    """Send a chat-completion request to the LiteLLM proxy and return the reply text.

    Args:
        model: LiteLLM route name (e.g. "local/llama3").
        messages: OpenAI-style chat messages (role/content dicts).
        max_tokens: Upper bound on the completion length.

    Returns:
        The content string of the first completion choice.

    Raises:
        httpx.HTTPStatusError: If the proxy returns a non-2xx response.
    """
    request_headers = {"Content-Type": "application/json"}
    # Attach auth only when a key is configured; the proxy may run keyless in-cluster.
    if LITELLM_API_KEY:
        request_headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
    body = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "stream": False,
    }
    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as http:
        resp = await http.post(
            f"{LITELLM_BASE_URL}/v1/chat/completions",
            headers=request_headers,
            json=body,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
@app.list_tools()
async def list_tools() -> list[Tool]:
    """Advertise the MCP tools this server exposes to connected clients.

    Returns the static tool catalog: schemas describe each tool's inputs so
    Claude Code can validate arguments before calling `call_tool`.
    """
    return [
        Tool(
            name="query_local_model",
            description=(
                "Send a prompt to a local Ollama model via LiteLLM. "
                "Use for tasks that don't require Claude's reasoning: summarization, "
                "formatting, simple code generation, boilerplate, regex, quick lookups. "
                f"Available models: {', '.join(MODELS.keys())}. "
                "Defaults to qwen-coder for code tasks, llama3 for general."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "prompt": {
                        "type": "string",
                        "description": "The prompt to send to the model.",
                    },
                    # Enum restricts callers to the aliases declared in MODELS.
                    "model": {
                        "type": "string",
                        "enum": list(MODELS.keys()),
                        "description": "Model to use. Omit to auto-select based on task type.",
                    },
                    "task_type": {
                        "type": "string",
                        "enum": ["code", "general"],
                        "description": "Task type for auto model selection when model is not specified.",
                        "default": "general",
                    },
                    "system_prompt": {
                        "type": "string",
                        "description": "Optional system prompt to guide model behavior.",
                    },
                    "max_tokens": {
                        "type": "integer",
                        "description": "Maximum tokens in response (default: 2048).",
                        "default": 2048,
                    },
                },
                "required": ["prompt"],
            },
        ),
        Tool(
            name="review_code",
            description=(
                "Review code for bugs, style issues, and improvements using a local code model. "
                "Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "The code to review.",
                    },
                    "language": {
                        "type": "string",
                        "description": "Programming language (e.g. python, go, typescript).",
                    },
                    "focus": {
                        "type": "string",
                        "description": "Optional focus area: bugs, style, security, performance, tests.",
                    },
                },
                "required": ["code"],
            },
        ),
        Tool(
            name="summarize",
            description=(
                "Summarize text, logs, or documentation using a local model. "
                "Use for large log files, changelogs, or documentation that doesn't need Claude's analysis."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "Text to summarize.",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["bullet_points", "paragraph", "key_facts"],
                        "description": "Output format for the summary.",
                        "default": "bullet_points",
                    },
                    "max_length": {
                        "type": "string",
                        "enum": ["short", "medium", "detailed"],
                        "description": "Desired summary length.",
                        "default": "medium",
                    },
                },
                "required": ["text"],
            },
        ),
        Tool(
            name="generate_boilerplate",
            description=(
                "Generate boilerplate code, configs, or scaffolding using a local code model. "
                "Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "description": {
                        "type": "string",
                        "description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').",
                    },
                    "context": {
                        "type": "string",
                        "description": "Optional additional context or constraints.",
                    },
                },
                "required": ["description"],
            },
        ),
        # No inputs: returns the static model catalog.
        Tool(
            name="list_models",
            description="List available local Ollama models and their use cases.",
            inputSchema={
                "type": "object",
                "properties": {},
            },
        ),
    ]
def _format_model_listing() -> str:
    """Return the static human-readable list of model aliases and use cases."""
    return "\n".join([
        "Available local models (routed via LiteLLM):",
        "- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis",
        "- llama3: llama3.1:8b — general tasks, summarization, Q&A",
        "- codellama: codellama:13b — code generation, completion",
        "- qwen-coder: qwen2.5-coder:14b — code tasks, default for code",
        "",
        f"LiteLLM endpoint: {LITELLM_BASE_URL}",
    ])
async def _query_local_model(arguments: dict) -> str:
    """Send a free-form prompt to an explicitly chosen or auto-selected model."""
    prompt = arguments["prompt"]
    task_type = arguments.get("task_type", "general")
    system_prompt = arguments.get("system_prompt")
    max_tokens = arguments.get("max_tokens", 2048)
    # An explicit model wins; otherwise fall back to a default by task type.
    if "model" in arguments:
        model = MODELS[arguments["model"]]
    elif task_type == "code":
        model = DEFAULT_CODE_MODEL
    else:
        model = DEFAULT_GENERAL_MODEL
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return await call_litellm(model, messages, max_tokens)
async def _review_code(arguments: dict) -> str:
    """Review code with the deepseek-coder model, optionally focused on one area."""
    code = arguments["code"]
    language = arguments.get("language", "")
    focus = arguments.get("focus", "general code quality")
    lang_str = f" {language}" if language else ""
    system = (
        f"You are an expert{lang_str} code reviewer. "
        "Be concise and actionable. Flag actual issues, not style preferences unless asked. "
        "Format: list issues with severity (critical/warning/info), then suggestions."
    )
    prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```"
    # Use the MODELS table instead of a hard-coded route so the alias map stays authoritative.
    return await call_litellm(MODELS["deepseek-coder"], [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ])
async def _summarize(arguments: dict) -> str:
    """Summarize text with the general model, honoring format/length hints."""
    text = arguments["text"]
    fmt = arguments.get("format", "bullet_points")
    length = arguments.get("max_length", "medium")
    length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"}
    fmt_guide = {
        "bullet_points": "Use bullet points.",
        "paragraph": "Write in paragraph form.",
        "key_facts": "Extract only key facts and numbers.",
    }
    system = "You are a concise technical summarizer. Extract the most important information."
    # .get with defaults: an out-of-enum value degrades gracefully instead of raising KeyError.
    prompt = (
        f"Summarize the following text. {fmt_guide.get(fmt, fmt_guide['bullet_points'])} "
        f"Length: {length_guide.get(length, length_guide['medium'])}.\n\n{text}"
    )
    return await call_litellm(DEFAULT_GENERAL_MODEL, [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ])
async def _generate_boilerplate(arguments: dict) -> str:
    """Generate boilerplate/scaffolding using the default code model."""
    description = arguments["description"]
    context = arguments.get("context", "")
    system = (
        "You are an expert DevOps and software engineer. "
        "Generate clean, production-ready boilerplate. "
        "Output only the file content with no explanation unless asked."
    )
    prompt = f"Generate: {description}"
    if context:
        prompt += f"\n\nContext/constraints:\n{context}"
    return await call_litellm(DEFAULT_CODE_MODEL, [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ])
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    """Dispatch an MCP tool invocation to its handler and wrap the result.

    Args:
        name: Tool name as advertised by `list_tools`.
        arguments: Tool arguments validated against the advertised schema.

    Returns:
        A single TextContent item with the model output, or an error message;
        all failures are reported as text rather than raised to the client.
    """
    try:
        if name == "list_models":
            result = _format_model_listing()
        elif name == "query_local_model":
            result = await _query_local_model(arguments)
        elif name == "review_code":
            result = await _review_code(arguments)
        elif name == "summarize":
            result = await _summarize(arguments)
        elif name == "generate_boilerplate":
            result = await _generate_boilerplate(arguments)
        else:
            return [TextContent(type="text", text=f"Unknown tool: {name}")]
        return [TextContent(type="text", text=result)]
    except httpx.HTTPStatusError as e:
        # logger.exception preserves the traceback (logger.error did not).
        logger.exception("LiteLLM HTTP error calling tool %s", name)
        return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")]
    except Exception as e:
        logger.exception("Tool %s failed", name)
        return [TextContent(type="text", text=f"Error: {str(e)}")]
def create_starlette_app() -> Starlette:
    """Build the Starlette ASGI app that exposes the MCP server over SSE.

    Routes:
        GET  /sse        — long-lived SSE stream carrying MCP protocol messages.
        POST /messages/  — client-to-server MCP messages (handled by the transport).
        GET  /health     — liveness probe for Kubernetes.
    """
    # Normal import instead of the previous inline __import__ hack; function-scope
    # keeps the module's top-level import block unchanged.
    from starlette.responses import JSONResponse

    sse = SseServerTransport("/messages/")

    async def handle_sse(request: Request):
        # request._send is private ASGI plumbing, but this is the standard
        # pattern for wiring SseServerTransport into Starlette.
        async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
            await app.run(streams[0], streams[1], app.create_initialization_options())

    async def health(request: Request) -> JSONResponse:
        # Proper async endpoint; the original built the response via a sync
        # lambda with a dynamic import.
        return JSONResponse({"status": "ok"})

    return Starlette(
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/messages/", app=sse.handle_post_message),
            Route("/health", endpoint=health),
        ]
    )
if __name__ == "__main__":
    # Local import: uvicorn is only needed when running as a standalone process.
    import uvicorn

    listen_port = int(os.environ.get("PORT", "8090"))
    logger.info("Starting Ollama MCP server on port %d", listen_port)
    logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL)
    # Bind all interfaces so the k8s Service can reach the pod.
    uvicorn.run(create_starlette_app(), host="0.0.0.0", port=listen_port)