Initial commit: Ollama MCP server

MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.
Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
Deployed to k8s ai-inference namespace via ArgoCD.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
commit 139a038505
Date: 2026-03-21 17:33:56 +00:00
6 changed files with 548 additions and 0 deletions
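
For reference, a minimal client-side sketch of exercising the deployed server over SSE. This is not part of the commit: it assumes the MCP Python SDK client API (sse_client plus ClientSession) and reuses the MetalLB address pinned in k8s/deployment.yaml (192.168.87.29:8090); adjust the URL for your environment.

# smoke_test.py -- illustrative sketch only; assumes the mcp SDK's sse_client/ClientSession
# client API and the LoadBalancer IP from k8s/deployment.yaml.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client

SERVER_URL = "http://192.168.87.29:8090/sse"  # MetalLB address from the Service manifest

async def main() -> None:
    async with sse_client(SERVER_URL) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print("tools:", [t.name for t in tools.tools])
            # Offload a simple code task to a local model instead of Claude
            result = await session.call_tool(
                "query_local_model",
                {"prompt": "Write a regex matching ISO-8601 dates.", "task_type": "code"},
            )
            print(result.content[0].text)

if __name__ == "__main__":
    asyncio.run(main())

The same session can call review_code, summarize, generate_boilerplate, or list_models with the input schemas defined in src/server.py.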


@@ -0,0 +1,65 @@
name: Build and Deploy
on:
push:
branches:
- main
pull_request:
branches:
- main
jobs:
build-push:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Build and push Docker image
run: |
echo "${{ secrets.DOCKER_PASSWORD }}" | docker login registry.storedbox.net \
-u ${{ secrets.DOCKER_USER }} --password-stdin
docker build -t registry.storedbox.net/ollama-mcp:${{ github.sha }} .
docker tag registry.storedbox.net/ollama-mcp:${{ github.sha }} \
registry.storedbox.net/ollama-mcp:latest
docker push registry.storedbox.net/ollama-mcp:${{ github.sha }}
docker push registry.storedbox.net/ollama-mcp:latest
deploy:
runs-on: ubuntu-latest
needs: build-push
if: github.ref == 'refs/heads/main'
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Set up kubectl
uses: azure/setup-kubectl@v3
- name: Configure kubectl
run: |
echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig
# an export in one step does not carry over to later steps; persist it via GITHUB_ENV
echo "KUBECONFIG=$PWD/kubeconfig" >> "$GITHUB_ENV"
- name: Apply k8s manifests
run: |
kubectl apply -f k8s/deployment.yaml --kubeconfig=kubeconfig
- name: Rollout restart to pull latest image
run: |
kubectl rollout restart deployment/ollama-mcp \
-n ai-inference --kubeconfig=kubeconfig
kubectl rollout status deployment/ollama-mcp \
-n ai-inference --kubeconfig=kubeconfig --timeout=120s
- name: Health check
run: |
sleep 10
MCP_IP=$(kubectl get svc ollama-mcp -n ai-inference \
-o jsonpath='{.status.loadBalancer.ingress[0].ip}' \
--kubeconfig=kubeconfig)
curl -f http://${MCP_IP}:8090/health || exit 1
echo "MCP server healthy at http://${MCP_IP}:8090"

Dockerfile

@@ -0,0 +1,20 @@
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src/ ./src/
ENV PYTHONUNBUFFERED=1 \
PORT=8090 \
LITELLM_BASE_URL=http://litellm.ai-inference.svc:4000 \
REQUEST_TIMEOUT=120
EXPOSE 8090
HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8090/health')"
CMD ["python", "src/server.py"]

k8s/argocd-app.yaml

@@ -0,0 +1,43 @@
---
# ArgoCD Application - deploy this once to bootstrap:
# kubectl apply -f k8s/argocd-app.yaml
#
# Prerequisite: add the repo to ArgoCD first:
# argocd repo add https://repo.adservio.us/ai_approver/ollama-mcp.git \
# --username <gitea-user> --password <gitea-token>
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: ollama-mcp
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: https://repo.adservio.us/ai_approver/ollama-mcp.git
targetRevision: main
path: k8s
directory:
exclude: argocd-app.yaml
destination:
server: https://kubernetes.default.svc
namespace: ai-inference
syncPolicy:
automated:
prune: true
selfHeal: true
allowEmpty: false
syncOptions:
- CreateNamespace=true
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
ignoreDifferences:
- group: apps
kind: Deployment
jsonPointers:
- /spec/replicas

k8s/deployment.yaml

@@ -0,0 +1,95 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: ollama-mcp
namespace: ai-inference
labels:
app: ollama-mcp
annotations:
argocd.argoproj.io/sync-wave: "10"
spec:
replicas: 1
selector:
matchLabels:
app: ollama-mcp
template:
metadata:
labels:
app: ollama-mcp
spec:
containers:
- name: ollama-mcp
image: registry.storedbox.net/ollama-mcp:latest
ports:
- containerPort: 8090
name: http
protocol: TCP
env:
- name: LITELLM_BASE_URL
value: "http://litellm.ai-inference.svc:4000"
- name: LITELLM_API_KEY
valueFrom:
secretKeyRef:
name: ollama-mcp-secrets
key: LITELLM_API_KEY
optional: true
- name: PORT
value: "8090"
- name: REQUEST_TIMEOUT
value: "120"
livenessProbe:
httpGet:
path: /health
port: 8090
initialDelaySeconds: 15
periodSeconds: 30
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /health
port: 8090
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 5
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- k3s-control-2
- k3s-worker-3
- k3s-worker-4
---
apiVersion: v1
kind: Service
metadata:
name: ollama-mcp
namespace: ai-inference
labels:
app: ollama-mcp
annotations:
metallb.universe.tf/loadBalancerIPs: "192.168.87.29"
spec:
type: LoadBalancer
ports:
- port: 8090
targetPort: 8090
protocol: TCP
name: http
selector:
app: ollama-mcp
---
# Create the LiteLLM API key secret before deploying:
# kubectl create secret generic ollama-mcp-secrets -n ai-inference \
#   --from-literal=LITELLM_API_KEY=<litellm-api-key>   # never commit a real key

requirements.txt

@@ -0,0 +1,4 @@
mcp>=1.0.0
httpx>=0.27.0
starlette>=0.41.0
uvicorn[standard]>=0.32.0

src/server.py

@@ -0,0 +1,321 @@
"""
Ollama MCP Server
Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
Runs as an HTTP/SSE MCP server in Kubernetes.
"""
import os
import httpx
import logging
from mcp.server import Server
from mcp.server.sse import SseServerTransport
from mcp.types import Tool, TextContent
from starlette.applications import Starlette
from starlette.routing import Route, Mount
from starlette.requests import Request
from starlette.responses import JSONResponse
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))
# Model aliases exposed to Claude Code
MODELS = {
"deepseek-coder": "local/deepseek-coder", # deepseek-coder-v2:16b - code tasks
"llama3": "local/llama3", # llama3.1:8b - general tasks
"codellama": "local/codellama", # codellama:13b - code generation
"qwen-coder": "local/qwen-coder", # qwen2.5-coder:14b - code tasks
}
DEFAULT_CODE_MODEL = "local/qwen-coder"
DEFAULT_GENERAL_MODEL = "local/llama3"
app = Server("ollama-mcp")
async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
"""Call LiteLLM proxy with the given model and messages."""
headers = {"Content-Type": "application/json"}
if LITELLM_API_KEY:
headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
payload = {
"model": model,
"messages": messages,
"max_tokens": max_tokens,
"stream": False,
}
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
response = await client.post(
f"{LITELLM_BASE_URL}/v1/chat/completions",
headers=headers,
json=payload,
)
response.raise_for_status()
data = response.json()
return data["choices"][0]["message"]["content"]
@app.list_tools()
async def list_tools() -> list[Tool]:
return [
Tool(
name="query_local_model",
description=(
"Send a prompt to a local Ollama model via LiteLLM. "
"Use for tasks that don't require Claude's reasoning: summarization, "
"formatting, simple code generation, boilerplate, regex, quick lookups. "
f"Available models: {', '.join(MODELS.keys())}. "
"Defaults to qwen-coder for code tasks, llama3 for general."
),
inputSchema={
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The prompt to send to the model.",
},
"model": {
"type": "string",
"enum": list(MODELS.keys()),
"description": "Model to use. Omit to auto-select based on task type.",
},
"task_type": {
"type": "string",
"enum": ["code", "general"],
"description": "Task type for auto model selection when model is not specified.",
"default": "general",
},
"system_prompt": {
"type": "string",
"description": "Optional system prompt to guide model behavior.",
},
"max_tokens": {
"type": "integer",
"description": "Maximum tokens in response (default: 2048).",
"default": 2048,
},
},
"required": ["prompt"],
},
),
Tool(
name="review_code",
description=(
"Review code for bugs, style issues, and improvements using a local code model. "
"Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic."
),
inputSchema={
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "The code to review.",
},
"language": {
"type": "string",
"description": "Programming language (e.g. python, go, typescript).",
},
"focus": {
"type": "string",
"description": "Optional focus area: bugs, style, security, performance, tests.",
},
},
"required": ["code"],
},
),
Tool(
name="summarize",
description=(
"Summarize text, logs, or documentation using a local model. "
"Use for large log files, changelogs, or documentation that doesn't need Claude's analysis."
),
inputSchema={
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "Text to summarize.",
},
"format": {
"type": "string",
"enum": ["bullet_points", "paragraph", "key_facts"],
"description": "Output format for the summary.",
"default": "bullet_points",
},
"max_length": {
"type": "string",
"enum": ["short", "medium", "detailed"],
"description": "Desired summary length.",
"default": "medium",
},
},
"required": ["text"],
},
),
Tool(
name="generate_boilerplate",
description=(
"Generate boilerplate code, configs, or scaffolding using a local code model. "
"Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc."
),
inputSchema={
"type": "object",
"properties": {
"description": {
"type": "string",
"description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').",
},
"context": {
"type": "string",
"description": "Optional additional context or constraints.",
},
},
"required": ["description"],
},
),
Tool(
name="list_models",
description="List available local Ollama models and their use cases.",
inputSchema={
"type": "object",
"properties": {},
},
),
]
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
try:
if name == "list_models":
model_info = "\n".join([
"Available local models (routed via LiteLLM):",
"- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis",
"- llama3: llama3.1:8b — general tasks, summarization, Q&A",
"- codellama: codellama:13b — code generation, completion",
"- qwen-coder: qwen2.5-coder:14b — code tasks, default for code",
"",
f"LiteLLM endpoint: {LITELLM_BASE_URL}",
])
return [TextContent(type="text", text=model_info)]
elif name == "query_local_model":
prompt = arguments["prompt"]
task_type = arguments.get("task_type", "general")
system_prompt = arguments.get("system_prompt")
max_tokens = arguments.get("max_tokens", 2048)
if "model" in arguments:
model = MODELS[arguments["model"]]
elif task_type == "code":
model = DEFAULT_CODE_MODEL
else:
model = DEFAULT_GENERAL_MODEL
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
result = await call_litellm(model, messages, max_tokens)
return [TextContent(type="text", text=result)]
elif name == "review_code":
code = arguments["code"]
language = arguments.get("language", "")
focus = arguments.get("focus", "general code quality")
lang_str = f" {language}" if language else ""
system = (
f"You are an expert{lang_str} code reviewer. "
"Be concise and actionable. Flag actual issues, not style preferences unless asked. "
"Format: list issues with severity (critical/warning/info), then suggestions."
)
prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```"
result = await call_litellm("local/deepseek-coder", [
{"role": "system", "content": system},
{"role": "user", "content": prompt},
])
return [TextContent(type="text", text=result)]
elif name == "summarize":
text = arguments["text"]
fmt = arguments.get("format", "bullet_points")
length = arguments.get("max_length", "medium")
length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"}
fmt_guide = {
"bullet_points": "Use bullet points.",
"paragraph": "Write in paragraph form.",
"key_facts": "Extract only key facts and numbers.",
}
system = "You are a concise technical summarizer. Extract the most important information."
prompt = (
f"Summarize the following text. {fmt_guide[fmt]} "
f"Length: {length_guide[length]}.\n\n{text}"
)
result = await call_litellm(DEFAULT_GENERAL_MODEL, [
{"role": "system", "content": system},
{"role": "user", "content": prompt},
])
return [TextContent(type="text", text=result)]
elif name == "generate_boilerplate":
description = arguments["description"]
context = arguments.get("context", "")
system = (
"You are an expert DevOps and software engineer. "
"Generate clean, production-ready boilerplate. "
"Output only the file content with no explanation unless asked."
)
prompt = f"Generate: {description}"
if context:
prompt += f"\n\nContext/constraints:\n{context}"
result = await call_litellm(DEFAULT_CODE_MODEL, [
{"role": "system", "content": system},
{"role": "user", "content": prompt},
])
return [TextContent(type="text", text=result)]
else:
return [TextContent(type="text", text=f"Unknown tool: {name}")]
except httpx.HTTPStatusError as e:
logger.error("LiteLLM HTTP error: %s", e)
return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")]
except Exception as e:
logger.error("Tool error: %s", e)
return [TextContent(type="text", text=f"Error: {str(e)}")]
def create_starlette_app() -> Starlette:
sse = SseServerTransport("/messages/")
async def handle_sse(request: Request):
async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
await app.run(streams[0], streams[1], app.create_initialization_options())
    async def health(request: Request) -> JSONResponse:
        return JSONResponse({"status": "ok"})
    return Starlette(
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/messages/", app=sse.handle_post_message),
            Route("/health", endpoint=health),
        ]
    )
if __name__ == "__main__":
import uvicorn
port = int(os.environ.get("PORT", "8090"))
logger.info("Starting Ollama MCP server on port %d", port)
logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL)
uvicorn.run(create_starlette_app(), host="0.0.0.0", port=port)
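
To debug the LiteLLM hop independently of MCP, the request that call_litellm builds can be reproduced directly with httpx. A minimal sketch, assuming the proxy is reachable at the default LITELLM_BASE_URL (in-cluster, or via a kubectl port-forward) and that the local/llama3 alias from MODELS is configured on the proxy.

# litellm_probe.py -- illustrative sketch mirroring call_litellm(); assumes the proxy URL
# and the "local/llama3" alias are valid in your environment.
import os

import httpx

LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

def probe() -> str:
    headers = {"Content-Type": "application/json"}
    if LITELLM_API_KEY:
        headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
    payload = {
        "model": "local/llama3",
        "messages": [{"role": "user", "content": "Reply with the single word: ok"}],
        "max_tokens": 16,
        "stream": False,
    }
    resp = httpx.post(
        f"{LITELLM_BASE_URL}/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(probe())

A 200 response with a sensible completion confirms the proxy and model routing are healthy before layering the MCP transport on top.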