commit 139a038505b296351b42ab0a8d4926dd0c9ac57c
Author: ai_approver
Date:   Sat Mar 21 17:33:56 2026 +0000

    Initial commit: Ollama MCP server

    MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.
    Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
    Deployed to k8s ai-inference namespace via ArgoCD.

    Co-Authored-By: Claude Sonnet 4.6

diff --git a/.gitea/workflows/build-deploy.yml b/.gitea/workflows/build-deploy.yml
new file mode 100644
index 0000000..531b58e
--- /dev/null
+++ b/.gitea/workflows/build-deploy.yml
@@ -0,0 +1,65 @@
+name: Build and Deploy
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build-push:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Build and push Docker image
+        run: |
+          docker login registry.storedbox.net \
+            -u ${{ secrets.DOCKER_USER }} \
+            -p ${{ secrets.DOCKER_PASSWORD }}
+
+          docker build -t registry.storedbox.net/ollama-mcp:${{ github.sha }} .
+          docker tag registry.storedbox.net/ollama-mcp:${{ github.sha }} \
+            registry.storedbox.net/ollama-mcp:latest
+
+          docker push registry.storedbox.net/ollama-mcp:${{ github.sha }}
+          docker push registry.storedbox.net/ollama-mcp:latest
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: build-push
+    if: github.ref == 'refs/heads/main'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up kubectl
+        uses: azure/setup-kubectl@v3
+
+      - name: Configure kubectl
+        run: |
+          echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig
+          export KUBECONFIG=kubeconfig
+
+      - name: Apply k8s manifests
+        run: |
+          kubectl apply -f k8s/deployment.yaml --kubeconfig=kubeconfig
+
+      - name: Rollout restart to pull latest image
+        run: |
+          kubectl rollout restart deployment/ollama-mcp \
+            -n ai-inference --kubeconfig=kubeconfig
+          kubectl rollout status deployment/ollama-mcp \
+            -n ai-inference --kubeconfig=kubeconfig --timeout=120s
+
+      - name: Health check
+        run: |
+          sleep 10
+          MCP_IP=$(kubectl get svc ollama-mcp -n ai-inference \
+            -o jsonpath='{.status.loadBalancer.ingress[0].ip}' \
+            --kubeconfig=kubeconfig)
+          curl -f http://${MCP_IP}:8090/health || exit 1
+          echo "MCP server healthy at http://${MCP_IP}:8090"
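The deploy job's health check can also be replayed from a workstation when the manifests are applied by hand rather than through CI. A minimal sketch, assuming the MetalLB address 192.168.87.29 that k8s/deployment.yaml pins for the Service; the file name and the 120-second deadline are illustrative, not part of the commit:

    # health_wait.py - poll the MCP /health endpoint until it answers or a deadline passes.
    # Assumes the LoadBalancer IP from k8s/deployment.yaml; adjust if MetalLB assigns another address.
    import sys
    import time
    import urllib.request

    URL = "http://192.168.87.29:8090/health"
    DEADLINE = time.time() + 120  # roughly matches the rollout timeout in the workflow

    while time.time() < DEADLINE:
        try:
            with urllib.request.urlopen(URL, timeout=5) as resp:
                if resp.status == 200:
                    print(f"MCP server healthy at {URL}")
                    sys.exit(0)
        except OSError:
            pass  # not up yet; keep polling
        time.sleep(5)

    print(f"Timed out waiting for {URL}", file=sys.stderr)
    sys.exit(1)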
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..715ad73
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY src/ ./src/
+
+ENV PYTHONUNBUFFERED=1 \
+    PORT=8090 \
+    LITELLM_BASE_URL=http://litellm.ai-inference.svc:4000 \
+    REQUEST_TIMEOUT=120
+
+EXPOSE 8090
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8090/health')"
+
+CMD ["python", "src/server.py"]

diff --git a/k8s/argocd-app.yaml b/k8s/argocd-app.yaml
new file mode 100644
index 0000000..8e0a968
--- /dev/null
+++ b/k8s/argocd-app.yaml
@@ -0,0 +1,43 @@
+---
+# ArgoCD Application - deploy this once to bootstrap:
+#   kubectl apply -f k8s/argocd-app.yaml
+#
+# Pre-requisite: Add the repo to ArgoCD first:
+#   argocd repo add https://repo.adservio.us/ai_approver/ollama-mcp.git \
+#     --username <user> --password <password>
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: ollama-mcp
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: default
+  source:
+    repoURL: https://repo.adservio.us/ai_approver/ollama-mcp.git
+    targetRevision: main
+    path: k8s
+    directory:
+      exclude: argocd-app.yaml
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: ai-inference
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+      allowEmpty: false
+    syncOptions:
+      - CreateNamespace=true
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
+  ignoreDifferences:
+    - group: apps
+      kind: Deployment
+      jsonPointers:
+        - /spec/replicas

diff --git a/k8s/deployment.yaml b/k8s/deployment.yaml
new file mode 100644
index 0000000..043fed9
--- /dev/null
+++ b/k8s/deployment.yaml
@@ -0,0 +1,95 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ollama-mcp
+  namespace: ai-inference
+  labels:
+    app: ollama-mcp
+  annotations:
+    argocd.argoproj.io/sync-wave: "10"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ollama-mcp
+  template:
+    metadata:
+      labels:
+        app: ollama-mcp
+    spec:
+      containers:
+        - name: ollama-mcp
+          image: registry.storedbox.net/ollama-mcp:latest
+          ports:
+            - containerPort: 8090
+              name: http
+              protocol: TCP
+          env:
+            - name: LITELLM_BASE_URL
+              value: "http://litellm.ai-inference.svc:4000"
+            - name: LITELLM_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: ollama-mcp-secrets
+                  key: LITELLM_API_KEY
+                  optional: true
+            - name: PORT
+              value: "8090"
+            - name: REQUEST_TIMEOUT
+              value: "120"
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8090
+            initialDelaySeconds: 15
+            periodSeconds: 30
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8090
+            initialDelaySeconds: 10
+            periodSeconds: 10
+            timeoutSeconds: 5
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "100m"
+            limits:
+              memory: "256Mi"
+              cpu: "500m"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: NotIn
+                    values:
+                      - k3s-control-2
+                      - k3s-worker-3
+                      - k3s-worker-4
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama-mcp
+  namespace: ai-inference
+  labels:
+    app: ollama-mcp
+  annotations:
+    metallb.universe.tf/loadBalancerIPs: "192.168.87.29"
+spec:
+  type: LoadBalancer
+  ports:
+    - port: 8090
+      targetPort: 8090
+      protocol: TCP
+      name: http
+  selector:
+    app: ollama-mcp
+---
+# Create the LiteLLM API key secret before deploying:
+#   kubectl create secret generic ollama-mcp-secrets -n ai-inference \
+#     --from-literal=LITELLM_API_KEY=a699d6c80639dcf56d5fb8f2a99e50d220b5189dcc2fa1fdc8ccee4dab4df77e
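Before the Deployment is rolled out, it is worth confirming that the LiteLLM proxy answers on the model aliases the server expects. A minimal sketch, assuming a local port-forward to the litellm Service (for example `kubectl port-forward -n ai-inference svc/litellm 4000:4000`, which is an assumption about that Service's name) and the key from the secret above exported as LITELLM_API_KEY; the request body mirrors what src/server.py sends:

    # litellm_check.py - one-off request against the LiteLLM proxy the MCP server will use.
    # Assumes a local port-forward; the "local/llama3" alias comes from src/server.py.
    import os
    import httpx

    base_url = os.environ.get("LITELLM_BASE_URL", "http://localhost:4000")
    api_key = os.environ.get("LITELLM_API_KEY", "")

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    resp = httpx.post(
        f"{base_url}/v1/chat/completions",
        headers=headers,
        json={
            "model": "local/llama3",
            "messages": [{"role": "user", "content": "Reply with the single word: ok"}],
            "max_tokens": 16,
            "stream": False,
        },
        timeout=120,
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])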
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e36be0d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+mcp>=1.0.0
+httpx>=0.27.0
+starlette>=0.41.0
+uvicorn[standard]>=0.32.0

diff --git a/src/server.py b/src/server.py
new file mode 100644
index 0000000..ed3348d
--- /dev/null
+++ b/src/server.py
@@ -0,0 +1,321 @@
+"""
+Ollama MCP Server
+Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
+Runs as an HTTP/SSE MCP server in Kubernetes.
+"""
+
+import os
+import logging
+
+import httpx
+from mcp.server import Server
+from mcp.server.sse import SseServerTransport
+from mcp.types import Tool, TextContent
+from starlette.applications import Starlette
+from starlette.routing import Route, Mount
+from starlette.requests import Request
+from starlette.responses import JSONResponse, Response
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
+LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
+REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))
+
+# Model aliases exposed to Claude Code
+MODELS = {
+    "deepseek-coder": "local/deepseek-coder",  # deepseek-coder-v2:16b - code tasks
+    "llama3": "local/llama3",                  # llama3.1:8b - general tasks
+    "codellama": "local/codellama",            # codellama:13b - code generation
+    "qwen-coder": "local/qwen-coder",          # qwen2.5-coder:14b - code tasks
+}
+
+DEFAULT_CODE_MODEL = "local/qwen-coder"
+DEFAULT_GENERAL_MODEL = "local/llama3"
+
+app = Server("ollama-mcp")
+
+
+async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
+    """Call LiteLLM proxy with the given model and messages."""
+    headers = {"Content-Type": "application/json"}
+    if LITELLM_API_KEY:
+        headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
+
+    payload = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+        response = await client.post(
+            f"{LITELLM_BASE_URL}/v1/chat/completions",
+            headers=headers,
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data["choices"][0]["message"]["content"]
+
+
+@app.list_tools()
+async def list_tools() -> list[Tool]:
+    return [
+        Tool(
+            name="query_local_model",
+            description=(
+                "Send a prompt to a local Ollama model via LiteLLM. "
+                "Use for tasks that don't require Claude's reasoning: summarization, "
+                "formatting, simple code generation, boilerplate, regex, quick lookups. "
+                f"Available models: {', '.join(MODELS.keys())}. "
+                "Defaults to qwen-coder for code tasks, llama3 for general."
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "prompt": {
+                        "type": "string",
+                        "description": "The prompt to send to the model.",
+                    },
+                    "model": {
+                        "type": "string",
+                        "enum": list(MODELS.keys()),
+                        "description": "Model to use. Omit to auto-select based on task type.",
+                    },
+                    "task_type": {
+                        "type": "string",
+                        "enum": ["code", "general"],
+                        "description": "Task type for auto model selection when model is not specified.",
+                        "default": "general",
+                    },
+                    "system_prompt": {
+                        "type": "string",
+                        "description": "Optional system prompt to guide model behavior.",
+                    },
+                    "max_tokens": {
+                        "type": "integer",
+                        "description": "Maximum tokens in response (default: 2048).",
+                        "default": 2048,
+                    },
+                },
+                "required": ["prompt"],
+            },
+        ),
+        Tool(
+            name="review_code",
+            description=(
+                "Review code for bugs, style issues, and improvements using a local code model. "
" + "Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic." + ), + inputSchema={ + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "The code to review.", + }, + "language": { + "type": "string", + "description": "Programming language (e.g. python, go, typescript).", + }, + "focus": { + "type": "string", + "description": "Optional focus area: bugs, style, security, performance, tests.", + }, + }, + "required": ["code"], + }, + ), + Tool( + name="summarize", + description=( + "Summarize text, logs, or documentation using a local model. " + "Use for large log files, changelogs, or documentation that doesn't need Claude's analysis." + ), + inputSchema={ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Text to summarize.", + }, + "format": { + "type": "string", + "enum": ["bullet_points", "paragraph", "key_facts"], + "description": "Output format for the summary.", + "default": "bullet_points", + }, + "max_length": { + "type": "string", + "enum": ["short", "medium", "detailed"], + "description": "Desired summary length.", + "default": "medium", + }, + }, + "required": ["text"], + }, + ), + Tool( + name="generate_boilerplate", + description=( + "Generate boilerplate code, configs, or scaffolding using a local code model. " + "Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc." + ), + inputSchema={ + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').", + }, + "context": { + "type": "string", + "description": "Optional additional context or constraints.", + }, + }, + "required": ["description"], + }, + ), + Tool( + name="list_models", + description="List available local Ollama models and their use cases.", + inputSchema={ + "type": "object", + "properties": {}, + }, + ), + ] + + +@app.call_tool() +async def call_tool(name: str, arguments: dict) -> list[TextContent]: + try: + if name == "list_models": + model_info = "\n".join([ + "Available local models (routed via LiteLLM):", + "- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis", + "- llama3: llama3.1:8b — general tasks, summarization, Q&A", + "- codellama: codellama:13b — code generation, completion", + "- qwen-coder: qwen2.5-coder:14b — code tasks, default for code", + "", + f"LiteLLM endpoint: {LITELLM_BASE_URL}", + ]) + return [TextContent(type="text", text=model_info)] + + elif name == "query_local_model": + prompt = arguments["prompt"] + task_type = arguments.get("task_type", "general") + system_prompt = arguments.get("system_prompt") + max_tokens = arguments.get("max_tokens", 2048) + + if "model" in arguments: + model = MODELS[arguments["model"]] + elif task_type == "code": + model = DEFAULT_CODE_MODEL + else: + model = DEFAULT_GENERAL_MODEL + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + result = await call_litellm(model, messages, max_tokens) + return [TextContent(type="text", text=result)] + + elif name == "review_code": + code = arguments["code"] + language = arguments.get("language", "") + focus = arguments.get("focus", "general code quality") + + lang_str = f" {language}" if language else "" + system = ( + f"You are an expert{lang_str} code reviewer. " + "Be concise and actionable. Flag actual issues, not style preferences unless asked. 
" + "Format: list issues with severity (critical/warning/info), then suggestions." + ) + prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```" + + result = await call_litellm("local/deepseek-coder", [ + {"role": "system", "content": system}, + {"role": "user", "content": prompt}, + ]) + return [TextContent(type="text", text=result)] + + elif name == "summarize": + text = arguments["text"] + fmt = arguments.get("format", "bullet_points") + length = arguments.get("max_length", "medium") + + length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"} + fmt_guide = { + "bullet_points": "Use bullet points.", + "paragraph": "Write in paragraph form.", + "key_facts": "Extract only key facts and numbers.", + } + + system = "You are a concise technical summarizer. Extract the most important information." + prompt = ( + f"Summarize the following text. {fmt_guide[fmt]} " + f"Length: {length_guide[length]}.\n\n{text}" + ) + + result = await call_litellm(DEFAULT_GENERAL_MODEL, [ + {"role": "system", "content": system}, + {"role": "user", "content": prompt}, + ]) + return [TextContent(type="text", text=result)] + + elif name == "generate_boilerplate": + description = arguments["description"] + context = arguments.get("context", "") + + system = ( + "You are an expert DevOps and software engineer. " + "Generate clean, production-ready boilerplate. " + "Output only the file content with no explanation unless asked." + ) + prompt = f"Generate: {description}" + if context: + prompt += f"\n\nContext/constraints:\n{context}" + + result = await call_litellm(DEFAULT_CODE_MODEL, [ + {"role": "system", "content": system}, + {"role": "user", "content": prompt}, + ]) + return [TextContent(type="text", text=result)] + + else: + return [TextContent(type="text", text=f"Unknown tool: {name}")] + + except httpx.HTTPStatusError as e: + logger.error("LiteLLM HTTP error: %s", e) + return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")] + except Exception as e: + logger.error("Tool error: %s", e) + return [TextContent(type="text", text=f"Error: {str(e)}")] + + +def create_starlette_app() -> Starlette: + sse = SseServerTransport("/messages/") + + async def handle_sse(request: Request): + async with sse.connect_sse(request.scope, request.receive, request._send) as streams: + await app.run(streams[0], streams[1], app.create_initialization_options()) + + return Starlette( + routes=[ + Route("/sse", endpoint=handle_sse), + Mount("/messages/", app=sse.handle_post_message), + Route("/health", endpoint=lambda r: __import__("starlette.responses", fromlist=["JSONResponse"]).JSONResponse({"status": "ok"})), + ] + ) + + +if __name__ == "__main__": + import uvicorn + port = int(os.environ.get("PORT", "8090")) + logger.info("Starting Ollama MCP server on port %d", port) + logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL) + uvicorn.run(create_starlette_app(), host="0.0.0.0", port=port)