commit 139a038505b296351b42ab0a8d4926dd0c9ac57c
Author: ai_approver
Date:   Sat Mar 21 17:33:56 2026 +0000

    Initial commit: Ollama MCP server

    MCP server exposing local Ollama models via LiteLLM proxy to Claude Code.
    Tools: query_local_model, review_code, summarize, generate_boilerplate, list_models.
    Deployed to k8s ai-inference namespace via ArgoCD.

    Co-Authored-By: Claude Sonnet 4.6

diff --git a/.gitea/workflows/build-deploy.yml b/.gitea/workflows/build-deploy.yml
new file mode 100644
index 0000000..531b58e
--- /dev/null
+++ b/.gitea/workflows/build-deploy.yml
@@ -0,0 +1,65 @@
+name: Build and Deploy
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build-push:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Build and push Docker image
+        run: |
+          docker login registry.storedbox.net \
+            -u ${{ secrets.DOCKER_USER }} \
+            -p ${{ secrets.DOCKER_PASSWORD }}
+
+          docker build -t registry.storedbox.net/ollama-mcp:${{ github.sha }} .
+          docker tag registry.storedbox.net/ollama-mcp:${{ github.sha }} \
+            registry.storedbox.net/ollama-mcp:latest
+
+          docker push registry.storedbox.net/ollama-mcp:${{ github.sha }}
+          docker push registry.storedbox.net/ollama-mcp:latest
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: build-push
+    if: github.ref == 'refs/heads/main'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up kubectl
+        uses: azure/setup-kubectl@v3
+
+      - name: Configure kubectl
+        run: |
+          echo "${{ secrets.KUBE_CONFIG }}" > kubeconfig
+          export KUBECONFIG=kubeconfig
+
+      - name: Apply k8s manifests
+        run: |
+          kubectl apply -f k8s/deployment.yaml --kubeconfig=kubeconfig
+
+      - name: Rollout restart to pull latest image
+        run: |
+          kubectl rollout restart deployment/ollama-mcp \
+            -n ai-inference --kubeconfig=kubeconfig
+          kubectl rollout status deployment/ollama-mcp \
+            -n ai-inference --kubeconfig=kubeconfig --timeout=120s
+
+      - name: Health check
+        run: |
+          sleep 10
+          MCP_IP=$(kubectl get svc ollama-mcp -n ai-inference \
+            -o jsonpath='{.status.loadBalancer.ingress[0].ip}' \
+            --kubeconfig=kubeconfig)
+          curl -f http://${MCP_IP}:8090/health || exit 1
+          echo "MCP server healthy at http://${MCP_IP}:8090"
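The deploy job's health check can also be replayed from a workstation when the manifests are applied by hand rather than through CI. A minimal sketch, assuming the MetalLB address 192.168.87.29 that k8s/deployment.yaml pins for the Service; the file name and the 120-second deadline are illustrative, not part of the commit:

    # health_wait.py - poll the MCP /health endpoint until it answers or a deadline passes.
    # Assumes the LoadBalancer IP from k8s/deployment.yaml; adjust if MetalLB assigns another address.
    import sys
    import time
    import urllib.request

    URL = "http://192.168.87.29:8090/health"
    DEADLINE = time.time() + 120  # roughly matches the rollout timeout in the workflow

    while time.time() < DEADLINE:
        try:
            with urllib.request.urlopen(URL, timeout=5) as resp:
                if resp.status == 200:
                    print(f"MCP server healthy at {URL}")
                    sys.exit(0)
        except OSError:
            pass  # not up yet; keep polling
        time.sleep(5)

    print(f"Timed out waiting for {URL}", file=sys.stderr)
    sys.exit(1)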
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..715ad73
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY src/ ./src/
+
+ENV PYTHONUNBUFFERED=1 \
+    PORT=8090 \
+    LITELLM_BASE_URL=http://litellm.ai-inference.svc:4000 \
+    REQUEST_TIMEOUT=120
+
+EXPOSE 8090
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8090/health')"
+
+CMD ["python", "src/server.py"]

diff --git a/k8s/argocd-app.yaml b/k8s/argocd-app.yaml
new file mode 100644
index 0000000..8e0a968
--- /dev/null
+++ b/k8s/argocd-app.yaml
@@ -0,0 +1,43 @@
+---
+# ArgoCD Application - deploy this once to bootstrap:
+#   kubectl apply -f k8s/argocd-app.yaml
+#
+# Pre-requisite: Add the repo to ArgoCD first:
+#   argocd repo add https://repo.adservio.us/ai_approver/ollama-mcp.git \
+#     --username <user> --password <password>
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: ollama-mcp
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: default
+  source:
+    repoURL: https://repo.adservio.us/ai_approver/ollama-mcp.git
+    targetRevision: main
+    path: k8s
+    directory:
+      exclude: argocd-app.yaml
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: ai-inference
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+      allowEmpty: false
+    syncOptions:
+      - CreateNamespace=true
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
+  ignoreDifferences:
+    - group: apps
+      kind: Deployment
+      jsonPointers:
+        - /spec/replicas

diff --git a/k8s/deployment.yaml b/k8s/deployment.yaml
new file mode 100644
index 0000000..043fed9
--- /dev/null
+++ b/k8s/deployment.yaml
@@ -0,0 +1,95 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ollama-mcp
+  namespace: ai-inference
+  labels:
+    app: ollama-mcp
+  annotations:
+    argocd.argoproj.io/sync-wave: "10"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ollama-mcp
+  template:
+    metadata:
+      labels:
+        app: ollama-mcp
+    spec:
+      containers:
+        - name: ollama-mcp
+          image: registry.storedbox.net/ollama-mcp:latest
+          ports:
+            - containerPort: 8090
+              name: http
+              protocol: TCP
+          env:
+            - name: LITELLM_BASE_URL
+              value: "http://litellm.ai-inference.svc:4000"
+            - name: LITELLM_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: ollama-mcp-secrets
+                  key: LITELLM_API_KEY
+                  optional: true
+            - name: PORT
+              value: "8090"
+            - name: REQUEST_TIMEOUT
+              value: "120"
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8090
+            initialDelaySeconds: 15
+            periodSeconds: 30
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8090
+            initialDelaySeconds: 10
+            periodSeconds: 10
+            timeoutSeconds: 5
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "100m"
+            limits:
+              memory: "256Mi"
+              cpu: "500m"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: NotIn
+                    values:
+                      - k3s-control-2
+                      - k3s-worker-3
+                      - k3s-worker-4
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama-mcp
+  namespace: ai-inference
+  labels:
+    app: ollama-mcp
+  annotations:
+    metallb.universe.tf/loadBalancerIPs: "192.168.87.29"
+spec:
+  type: LoadBalancer
+  ports:
+    - port: 8090
+      targetPort: 8090
+      protocol: TCP
+      name: http
+  selector:
+    app: ollama-mcp
+---
+# Create the LiteLLM API key secret before deploying:
+#   kubectl create secret generic ollama-mcp-secrets -n ai-inference \
+#     --from-literal=LITELLM_API_KEY=a699d6c80639dcf56d5fb8f2a99e50d220b5189dcc2fa1fdc8ccee4dab4df77e
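Before the Deployment is rolled out, it is worth confirming that the LiteLLM proxy answers on the model aliases the server expects. A minimal sketch, assuming a local port-forward to the litellm Service (for example `kubectl port-forward -n ai-inference svc/litellm 4000:4000`, which is an assumption about that Service's name) and the key from the secret above exported as LITELLM_API_KEY; the request body mirrors what src/server.py sends:

    # litellm_check.py - one-off request against the LiteLLM proxy the MCP server will use.
    # Assumes a local port-forward; the "local/llama3" alias comes from src/server.py.
    import os
    import httpx

    base_url = os.environ.get("LITELLM_BASE_URL", "http://localhost:4000")
    api_key = os.environ.get("LITELLM_API_KEY", "")

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    resp = httpx.post(
        f"{base_url}/v1/chat/completions",
        headers=headers,
        json={
            "model": "local/llama3",
            "messages": [{"role": "user", "content": "Reply with the single word: ok"}],
            "max_tokens": 16,
            "stream": False,
        },
        timeout=120,
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])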
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e36be0d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+mcp>=1.0.0
+httpx>=0.27.0
+starlette>=0.41.0
+uvicorn[standard]>=0.32.0

diff --git a/src/server.py b/src/server.py
new file mode 100644
index 0000000..ed3348d
--- /dev/null
+++ b/src/server.py
@@ -0,0 +1,321 @@
+"""
+Ollama MCP Server
+Exposes local LLM models (via LiteLLM proxy) as MCP tools for Claude Code.
+Runs as an HTTP/SSE MCP server in Kubernetes.
+"""
+
+import os
+import logging
+
+import httpx
+from mcp.server import Server
+from mcp.server.sse import SseServerTransport
+from mcp.types import Tool, TextContent
+from starlette.applications import Starlette
+from starlette.routing import Route, Mount
+from starlette.requests import Request
+from starlette.responses import JSONResponse, Response
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+LITELLM_BASE_URL = os.environ.get("LITELLM_BASE_URL", "http://litellm.ai-inference.svc:4000")
+LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
+REQUEST_TIMEOUT = int(os.environ.get("REQUEST_TIMEOUT", "120"))
+
+# Model aliases exposed to Claude Code
+MODELS = {
+    "deepseek-coder": "local/deepseek-coder",  # deepseek-coder-v2:16b - code tasks
+    "llama3": "local/llama3",                  # llama3.1:8b - general tasks
+    "codellama": "local/codellama",            # codellama:13b - code generation
+    "qwen-coder": "local/qwen-coder",          # qwen2.5-coder:14b - code tasks
+}
+
+DEFAULT_CODE_MODEL = "local/qwen-coder"
+DEFAULT_GENERAL_MODEL = "local/llama3"
+
+app = Server("ollama-mcp")
+
+
+async def call_litellm(model: str, messages: list, max_tokens: int = 2048) -> str:
+    """Call LiteLLM proxy with the given model and messages."""
+    headers = {"Content-Type": "application/json"}
+    if LITELLM_API_KEY:
+        headers["Authorization"] = f"Bearer {LITELLM_API_KEY}"
+
+    payload = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+        response = await client.post(
+            f"{LITELLM_BASE_URL}/v1/chat/completions",
+            headers=headers,
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data["choices"][0]["message"]["content"]
+
+
+@app.list_tools()
+async def list_tools() -> list[Tool]:
+    return [
+        Tool(
+            name="query_local_model",
+            description=(
+                "Send a prompt to a local Ollama model via LiteLLM. "
+                "Use for tasks that don't require Claude's reasoning: summarization, "
+                "formatting, simple code generation, boilerplate, regex, quick lookups. "
+                f"Available models: {', '.join(MODELS.keys())}. "
+                "Defaults to qwen-coder for code tasks, llama3 for general."
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "prompt": {
+                        "type": "string",
+                        "description": "The prompt to send to the model.",
+                    },
+                    "model": {
+                        "type": "string",
+                        "enum": list(MODELS.keys()),
+                        "description": "Model to use. Omit to auto-select based on task type.",
+                    },
+                    "task_type": {
+                        "type": "string",
+                        "enum": ["code", "general"],
+                        "description": "Task type for auto model selection when model is not specified.",
+                        "default": "general",
+                    },
+                    "system_prompt": {
+                        "type": "string",
+                        "description": "Optional system prompt to guide model behavior.",
+                    },
+                    "max_tokens": {
+                        "type": "integer",
+                        "description": "Maximum tokens in response (default: 2048).",
+                        "default": 2048,
+                    },
+                },
+                "required": ["prompt"],
+            },
+        ),
+        Tool(
+            name="review_code",
+            description=(
+                "Review code for bugs, style issues, and improvements using a local code model. "
" + "Uses deepseek-coder-v2:16b. Good for quick reviews without sending code to Anthropic." + ), + inputSchema={ + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "The code to review.", + }, + "language": { + "type": "string", + "description": "Programming language (e.g. python, go, typescript).", + }, + "focus": { + "type": "string", + "description": "Optional focus area: bugs, style, security, performance, tests.", + }, + }, + "required": ["code"], + }, + ), + Tool( + name="summarize", + description=( + "Summarize text, logs, or documentation using a local model. " + "Use for large log files, changelogs, or documentation that doesn't need Claude's analysis." + ), + inputSchema={ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Text to summarize.", + }, + "format": { + "type": "string", + "enum": ["bullet_points", "paragraph", "key_facts"], + "description": "Output format for the summary.", + "default": "bullet_points", + }, + "max_length": { + "type": "string", + "enum": ["short", "medium", "detailed"], + "description": "Desired summary length.", + "default": "medium", + }, + }, + "required": ["text"], + }, + ), + Tool( + name="generate_boilerplate", + description=( + "Generate boilerplate code, configs, or scaffolding using a local code model. " + "Use for Dockerfiles, k8s manifests, CI configs, test stubs, etc." + ), + inputSchema={ + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "What to generate (e.g. 'Dockerfile for a Python FastAPI app').", + }, + "context": { + "type": "string", + "description": "Optional additional context or constraints.", + }, + }, + "required": ["description"], + }, + ), + Tool( + name="list_models", + description="List available local Ollama models and their use cases.", + inputSchema={ + "type": "object", + "properties": {}, + }, + ), + ] + + +@app.call_tool() +async def call_tool(name: str, arguments: dict) -> list[TextContent]: + try: + if name == "list_models": + model_info = "\n".join([ + "Available local models (routed via LiteLLM):", + "- deepseek-coder: deepseek-coder-v2:16b — code review, debugging, analysis", + "- llama3: llama3.1:8b — general tasks, summarization, Q&A", + "- codellama: codellama:13b — code generation, completion", + "- qwen-coder: qwen2.5-coder:14b — code tasks, default for code", + "", + f"LiteLLM endpoint: {LITELLM_BASE_URL}", + ]) + return [TextContent(type="text", text=model_info)] + + elif name == "query_local_model": + prompt = arguments["prompt"] + task_type = arguments.get("task_type", "general") + system_prompt = arguments.get("system_prompt") + max_tokens = arguments.get("max_tokens", 2048) + + if "model" in arguments: + model = MODELS[arguments["model"]] + elif task_type == "code": + model = DEFAULT_CODE_MODEL + else: + model = DEFAULT_GENERAL_MODEL + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + result = await call_litellm(model, messages, max_tokens) + return [TextContent(type="text", text=result)] + + elif name == "review_code": + code = arguments["code"] + language = arguments.get("language", "") + focus = arguments.get("focus", "general code quality") + + lang_str = f" {language}" if language else "" + system = ( + f"You are an expert{lang_str} code reviewer. " + "Be concise and actionable. Flag actual issues, not style preferences unless asked. 
" + "Format: list issues with severity (critical/warning/info), then suggestions." + ) + prompt = f"Review this{lang_str} code focusing on {focus}:\n\n```{language}\n{code}\n```" + + result = await call_litellm("local/deepseek-coder", [ + {"role": "system", "content": system}, + {"role": "user", "content": prompt}, + ]) + return [TextContent(type="text", text=result)] + + elif name == "summarize": + text = arguments["text"] + fmt = arguments.get("format", "bullet_points") + length = arguments.get("max_length", "medium") + + length_guide = {"short": "2-3 sentences", "medium": "5-8 points", "detailed": "comprehensive"} + fmt_guide = { + "bullet_points": "Use bullet points.", + "paragraph": "Write in paragraph form.", + "key_facts": "Extract only key facts and numbers.", + } + + system = "You are a concise technical summarizer. Extract the most important information." + prompt = ( + f"Summarize the following text. {fmt_guide[fmt]} " + f"Length: {length_guide[length]}.\n\n{text}" + ) + + result = await call_litellm(DEFAULT_GENERAL_MODEL, [ + {"role": "system", "content": system}, + {"role": "user", "content": prompt}, + ]) + return [TextContent(type="text", text=result)] + + elif name == "generate_boilerplate": + description = arguments["description"] + context = arguments.get("context", "") + + system = ( + "You are an expert DevOps and software engineer. " + "Generate clean, production-ready boilerplate. " + "Output only the file content with no explanation unless asked." + ) + prompt = f"Generate: {description}" + if context: + prompt += f"\n\nContext/constraints:\n{context}" + + result = await call_litellm(DEFAULT_CODE_MODEL, [ + {"role": "system", "content": system}, + {"role": "user", "content": prompt}, + ]) + return [TextContent(type="text", text=result)] + + else: + return [TextContent(type="text", text=f"Unknown tool: {name}")] + + except httpx.HTTPStatusError as e: + logger.error("LiteLLM HTTP error: %s", e) + return [TextContent(type="text", text=f"LiteLLM error {e.response.status_code}: {e.response.text}")] + except Exception as e: + logger.error("Tool error: %s", e) + return [TextContent(type="text", text=f"Error: {str(e)}")] + + +def create_starlette_app() -> Starlette: + sse = SseServerTransport("/messages/") + + async def handle_sse(request: Request): + async with sse.connect_sse(request.scope, request.receive, request._send) as streams: + await app.run(streams[0], streams[1], app.create_initialization_options()) + + return Starlette( + routes=[ + Route("/sse", endpoint=handle_sse), + Mount("/messages/", app=sse.handle_post_message), + Route("/health", endpoint=lambda r: __import__("starlette.responses", fromlist=["JSONResponse"]).JSONResponse({"status": "ok"})), + ] + ) + + +if __name__ == "__main__": + import uvicorn + port = int(os.environ.get("PORT", "8090")) + logger.info("Starting Ollama MCP server on port %d", port) + logger.info("LiteLLM base URL: %s", LITELLM_BASE_URL) + uvicorn.run(create_starlette_app(), host="0.0.0.0", port=port)