Merge pull request #173 from svc-design/codex/add-dockerfiles-and-helm-chart-for-models-qc3pt7

Add CUDA LLM images and multi-model Helm chart
This commit is contained in:
shenlan 2025-10-27 16:29:37 +08:00 committed by GitHub
commit 03b7ba02fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 694 additions and 0 deletions

View File

@ -0,0 +1,36 @@
# Lints and packages the model-serving Helm chart, then uploads the packaged
# archive as a build artifact.
name: build chart multi-model llm

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/multi-model-LLM/**'
      - '.github/workflows/build-chart-multi-model-LLM.yaml'
  # Manual trigger. `workflow_dispatch` only accepts `inputs`; a `branches`
  # filter is not a valid key here, so none is declared.
  workflow_dispatch: {}

env:
  CHART_DIR: oci/multi-model-LLM/charts/model-serving
  # Directory that receives the packaged .tgz.
  CHART_OUT_DIR: oci/multi-model-LLM/charts
  # Single source for --version, --app-version and the artifact file name.
  CHART_VERSION: "0.1.0"

jobs:
  lint-and-package:
    name: Lint and package Helm chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Helm
        uses: azure/setup-helm@v4
        with:
          version: v3.14.4

      - name: Helm lint
        run: helm lint "$CHART_DIR"

      - name: Helm package
        run: >-
          helm package "$CHART_DIR"
          --version "$CHART_VERSION"
          --app-version "$CHART_VERSION"
          -d "$CHART_OUT_DIR"

      - name: Upload chart artifact
        uses: actions/upload-artifact@v4
        with:
          name: model-serving-chart
          path: ${{ env.CHART_OUT_DIR }}/model-serving-${{ env.CHART_VERSION }}.tgz
          # Fail the job if packaging did not produce the expected archive.
          if-no-files-found: error

View File

@ -0,0 +1,29 @@
# Builds the Ollama image by calling the shared svc-design image-build
# reusable workflow.
name: build image ollama

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/base/cuda/Ollama/Dockerfile'
      - '.github/workflows/build-ci-image-Ollama.yaml'
  # Manual trigger. `workflow_dispatch` only accepts `inputs`; a `branches`
  # filter is not a valid key here, so none is declared.
  workflow_dispatch: {}

# NOTE(review): nothing in this file reads IMAGE_REPO, and workflow-level env
# is not propagated into a called reusable workflow — confirm it is still
# needed.
env:
  IMAGE_REPO: "artifact.svc.plus"

jobs:
  build-ollama:
    name: Build Ollama image
    uses: svc-design/actions/.github/workflows/build-images.yaml@main
    with:
      method: 'docker'
      registry_addr: "harbor.onwalk.net"
      dockerfile_path: 'oci/base/cuda/Ollama'
      image_name: 'public/base/cuda/ollama'
      image_tag: 'latest'
    secrets:
      artifactory_sa: ${{ secrets.REPO_USER }}
      artifactory_pw: ${{ secrets.HELM_REPO_PASSWORD }}

View File

@ -0,0 +1,29 @@
# Builds the CUDA SGLang image by calling the shared svc-design image-build
# reusable workflow.
name: build image cuda sglang

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/base/cuda/SGLang/Dockerfile'
      - '.github/workflows/build-ci-image-SGLang.yaml'
  # Manual trigger. `workflow_dispatch` only accepts `inputs`; a `branches`
  # filter is not a valid key here, so none is declared.
  workflow_dispatch: {}

# NOTE(review): nothing in this file reads IMAGE_REPO, and workflow-level env
# is not propagated into a called reusable workflow — confirm it is still
# needed.
env:
  IMAGE_REPO: "artifact.svc.plus"

jobs:
  build-sglang:
    name: Build CUDA SGLang image
    uses: svc-design/actions/.github/workflows/build-images.yaml@main
    with:
      method: 'docker'
      registry_addr: "harbor.onwalk.net"
      dockerfile_path: 'oci/base/cuda/SGLang'
      image_name: 'public/base/cuda/sglang'
      image_tag: 'cuda12'
    secrets:
      artifactory_sa: ${{ secrets.REPO_USER }}
      artifactory_pw: ${{ secrets.HELM_REPO_PASSWORD }}

View File

@ -0,0 +1,29 @@
# Builds the CUDA vLLM image by calling the shared svc-design image-build
# reusable workflow.
name: build image cuda vllm

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/base/cuda/vLLM/Dockerfile'
      - '.github/workflows/build-ci-image-vLLM.yaml'
  # Manual trigger. `workflow_dispatch` only accepts `inputs`; a `branches`
  # filter is not a valid key here, so none is declared.
  workflow_dispatch: {}

# NOTE(review): nothing in this file reads IMAGE_REPO, and workflow-level env
# is not propagated into a called reusable workflow — confirm it is still
# needed.
env:
  IMAGE_REPO: "artifact.svc.plus"

jobs:
  build-vllm:
    name: Build CUDA vLLM image
    uses: svc-design/actions/.github/workflows/build-images.yaml@main
    with:
      method: 'docker'
      registry_addr: "harbor.onwalk.net"
      dockerfile_path: 'oci/base/cuda/vLLM'
      image_name: 'public/base/cuda/vllm'
      image_tag: 'cuda12'
    secrets:
      artifactory_sa: ${{ secrets.REPO_USER }}
      artifactory_pw: ${{ secrets.HELM_REPO_PASSWORD }}

29
oci/base/cuda/Makefile Normal file
View File

@ -0,0 +1,29 @@
# Build and push the vLLM, SGLang and Ollama images.
# Each image is built from the directory of the same name next to this file.

ORG ?= your-org
REGISTRY ?= ghcr.io/$(ORG)/model-serving

VLLM_TAG ?= cuda12
SGLANG_TAG ?= cuda12
OLLAMA_TAG ?= latest

# Fully qualified image references; expanded lazily so overrides of REGISTRY
# or the *_TAG variables are picked up.
VLLM_IMAGE = $(REGISTRY)/vllm:$(VLLM_TAG)
SGLANG_IMAGE = $(REGISTRY)/sglang:$(SGLANG_TAG)
OLLAMA_IMAGE = $(REGISTRY)/ollama:$(OLLAMA_TAG)

.PHONY: docker-build docker-push
.PHONY: docker-build-vllm docker-push-vllm
.PHONY: docker-build-sglang docker-push-sglang
.PHONY: docker-build-ollama docker-push-ollama

# Aggregate targets (docker-build stays first, so it remains the default goal).
docker-build: docker-build-vllm docker-build-sglang docker-build-ollama

docker-push: docker-push-vllm docker-push-sglang docker-push-ollama

# vLLM
docker-build-vllm:
	docker build -t $(VLLM_IMAGE) vLLM

docker-push-vllm:
	docker push $(VLLM_IMAGE)

# SGLang
docker-build-sglang:
	docker build -t $(SGLANG_IMAGE) SGLang

docker-push-sglang:
	docker push $(SGLANG_IMAGE)

# Ollama
docker-build-ollama:
	docker build -t $(OLLAMA_IMAGE) Ollama

docker-push-ollama:
	docker push $(OLLAMA_IMAGE)

View File

@ -0,0 +1,28 @@
# Ollama runtime image that leans on host NVIDIA drivers via container toolkit
FROM ubuntu:22.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
      curl ca-certificates unzip gnupg \
    && rm -rf /var/lib/apt/lists/*

# The release archive is laid out as bin/ollama plus lib/ollama/..., so it has
# to be unpacked into a prefix (/usr/local), not into a bin directory.
# Extracting into /usr/local/bin would place the binary at
# /usr/local/bin/bin/ollama and make the chmod below fail the build.
RUN curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | tar -xz -C /usr/local \
    && chmod +x /usr/local/bin/ollama

# Unprivileged runtime user; the model store lives in its home directory.
RUN useradd -m -u 10001 app && mkdir -p /home/app/.ollama && chown -R app:app /home/app
USER app

# OLLAMA_MODEL is read only by the entrypoint below, to decide what to pull.
ENV OLLAMA_HOST=0.0.0.0:11434 \
    OLLAMA_MODELS=/home/app/.ollama/models \
    OLLAMA_MODEL="phi3:latest"

EXPOSE 11434

HEALTHCHECK --interval=30s --timeout=5s --start-period=30s CMD curl -fsS http://127.0.0.1:11434/api/tags || exit 1

# Start the server in the background, poll the API for up to 30 seconds, pull
# the configured model on a best-effort basis, then block until the server
# process exits.
ENTRYPOINT ["bash","-lc","set -euo pipefail; \
ollama serve & \
for i in $(seq 1 30); do sleep 1; curl -fsS http://127.0.0.1:11434/api/tags && break || true; done; \
ollama pull \"$OLLAMA_MODEL\" || true; \
wait -n"]

View File

@ -0,0 +1,30 @@
# CUDA 12.1 runtime base for SGLang
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 python3-venv python3-pip git curl ca-certificates build-essential \
    && rm -rf /var/lib/apt/lists/*

ENV PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Install CUDA-enabled PyTorch first, then SGLang.
RUN pip3 install --upgrade pip \
    && pip3 install --extra-index-url https://download.pytorch.org/whl/cu121 \
      torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 \
    && pip3 install sglang==0.2.3 uvicorn fastapi

EXPOSE 30000

# Runtime knobs consumed by the entrypoint; override them per deployment.
ENV SGLANG_MODEL="Qwen/Qwen2-7B-Instruct" \
    SGLANG_PORT=30000 \
    SGLANG_ARGS="--tp 1 --context-length 8192"

RUN useradd -m -u 10001 app && mkdir -p /models && chown -R app:app /models
USER app

HEALTHCHECK --interval=30s --timeout=5s --start-period=30s CMD curl -fsS http://127.0.0.1:${SGLANG_PORT}/v1/models || exit 1

# --host 0.0.0.0: launch_server listens on loopback unless told otherwise,
#   which leaves the exposed port unreachable from outside the container.
#   It precedes $SGLANG_ARGS so a --host given there still takes precedence.
# --model-path: the full option name rather than the abbreviated --model.
# NOTE(review): the former --enable-openai-compatible-api flag was removed
#   because it is not among launch_server's documented options; confirm
#   against the pinned sglang release.
ENTRYPOINT ["bash","-lc","python3 -m sglang.launch_server --model-path \"$SGLANG_MODEL\" --host 0.0.0.0 --port $SGLANG_PORT --trust-remote-code $SGLANG_ARGS"]

View File

@ -0,0 +1,33 @@
# CUDA 12.1 + cuDNN8 runtime base — tested with recent PyTorch wheels
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

ARG DEBIAN_FRONTEND=noninteractive

# System deps + Python
RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 python3-venv python3-pip git curl ca-certificates \
    && rm -rf /var/lib/apt/lists/*

ENV PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Install CUDA-enabled PyTorch + vLLM
RUN pip3 install --upgrade pip \
    && pip3 install --extra-index-url https://download.pytorch.org/whl/cu121 \
      torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 \
    && pip3 install vllm==0.5.2 uvicorn fastapi

EXPOSE 8000

# MODEL_PATH and VLLM_ARGS are consumed by the entrypoint below; HF_HOME puts
# the Hugging Face cache under /models, which is owned by the runtime user.
ENV MODEL_PATH="meta-llama/Meta-Llama-3-8B-Instruct" \
    VLLM_ARGS="--max-model-len 8192 --gpu-memory-utilization 0.9" \
    HF_HOME=/models/.cache \
    VLLM_WORKER_USE_GRAPH_EXECUTOR=1

RUN useradd -m -u 10001 app && mkdir -p /models && chown -R app:app /models
USER app

# Probe /health rather than /v1/models: the server is started with --api-key,
# so unauthenticated requests under /v1 are rejected and `curl -f` would mark
# the container unhealthy forever.
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s CMD curl -fsS http://127.0.0.1:8000/health || exit 1

# The server is launched through its module entry point so the image does not
# depend on the `vllm` console script being shipped by the pinned release.
ENTRYPOINT ["bash","-lc","python3 -m vllm.entrypoints.openai.api_server --model \"$MODEL_PATH\" --port 8000 --api-key dummy $VLLM_ARGS"]

View File

@ -0,0 +1,35 @@
# Developer entry points for the model-serving chart: image builds are
# delegated to ../base/cuda, chart tasks run against charts/<CHART_NAME>.

ORG ?= your-org
IMAGE_REGISTRY ?= ghcr.io/$(ORG)/model-serving

CHART_NAME ?= model-serving
CHART_DIR := charts/$(CHART_NAME)
VERSION ?= 0.1.0

# Helm release name and target namespace used by install/uninstall/template.
RELEASE ?= ms
NAMESPACE ?= llm

# Archive produced by helm-package and consumed by helm-push.
CHART_PACKAGE = charts/$(CHART_NAME)-$(VERSION).tgz

.PHONY: docker-build docker-push
.PHONY: helm-lint helm-package helm-push
.PHONY: install uninstall template

# --- images -----------------------------------------------------------------

docker-build:
	$(MAKE) -C ../base/cuda docker-build REGISTRY=$(IMAGE_REGISTRY)

docker-push:
	$(MAKE) -C ../base/cuda docker-push REGISTRY=$(IMAGE_REGISTRY)

# --- chart ------------------------------------------------------------------

helm-lint:
	helm lint $(CHART_DIR)

helm-package:
	helm package $(CHART_DIR) --version $(VERSION) --app-version $(VERSION) -d charts/

helm-push: helm-package
	helm push $(CHART_PACKAGE) oci://ghcr.io/$(ORG)/helm

template:
	helm template $(RELEASE) $(CHART_DIR)

# --- release lifecycle ------------------------------------------------------

# The namespace is created idempotently before the release is installed.
install:
	kubectl create ns $(NAMESPACE) --dry-run=client -o yaml | kubectl apply -f -
	helm upgrade --install $(RELEASE) $(CHART_DIR) -n $(NAMESPACE)

# Best effort: a missing release does not fail the target.
uninstall:
	helm uninstall $(RELEASE) -n $(NAMESPACE) || true

View File

@ -0,0 +1,101 @@
# CUDA LLM Serving — vLLM / SGLang / Ollama (Kubernetes)
This package bundles three CUDA-ready images plus a single Helm chart that can serve **multiple models** behind one host with **path-based routing** such as:
- `https://api.svc.plus/v1/llama3` → vLLM (OpenAI-compatible)
- `https://api.svc.plus/v1/qwen2` → SGLang (OpenAI-compatible)
- `https://api.svc.plus/v1/phi3` → Ollama `/api/*`
The Dockerfiles live under [`oci/base/cuda`](../base/cuda/), while the Helm chart is in [`charts/model-serving`](charts/model-serving/).
## Prerequisites
- Kubernetes ≥ 1.25
- NVIDIA GPUs on worker nodes + NVIDIA Container Toolkit
- Ingress Controller (e.g. NGINX) and TLS secret if using HTTPS
- (Optional) GitHub Container Registry (GHCR) for distributing images and charts
## Build & Publish
```bash
# 1) Build and push images to GHCR (adjust ORG)
make -C oci/base/cuda ORG=svc-design docker-build docker-push
# 2) Lint & package the chart
make -C oci/multi-model-LLM helm-lint helm-package VERSION=0.1.0
# 3) Push chart as OCI to GHCR
make -C oci/multi-model-LLM ORG=svc-design VERSION=0.1.0 helm-push
```
> Authenticate GHCR first:
>
> ```bash
> echo $GHCR_TOKEN | docker login ghcr.io -u <GITHUB_USER> --password-stdin
> echo $GHCR_TOKEN | helm registry login ghcr.io -u <GITHUB_USER> --password-stdin
> ```
## Install
```bash
# install into namespace llm with release name ms
make -C oci/multi-model-LLM install RELEASE=ms NAMESPACE=llm
```
## Configure Models
Edit [`charts/model-serving/values.yaml`](charts/model-serving/values.yaml) and extend the `models:` list. Example:
```yaml
models:
- name: llama3-8b-vllm
engine: vllm
image: "model-serving/vllm"
tag: "cuda12"
path: v1/llama3
env:
- name: MODEL_PATH
value: meta-llama/Meta-Llama-3-8B-Instruct
- name: VLLM_ARGS
value: --max-model-len 8192 --gpu-memory-utilization 0.9
resources:
limits:
nvidia.com/gpu: 1
- name: qwen2-7b-sglang
engine: sglang
image: "model-serving/sglang"
tag: "cuda12"
path: v1/qwen2
env:
- name: SGLANG_MODEL
value: Qwen/Qwen2-7B-Instruct
- name: phi3-ollama
engine: ollama
image: "model-serving/ollama"
tag: latest
path: v1/phi3
env:
- name: OLLAMA_MODEL
value: phi3:latest
```
Deployments and services are generated per model, and a single ingress exposes them under unique paths.
## Runtime Notes
* **GPU scheduling**: Templates default `runtimeClassName` to `nvidia`; GPU limits come from each model's `resources` block in `values.yaml`. Ensure the cluster has the NVIDIA device plugin and a matching RuntimeClass defined, or override `runtimeClassName` per model.
* **Storage**: vLLM/SGLang cache defaults to the container filesystem. Mount an external volume by extending the template if persistence is required.
* **Authentication**: vLLM launches with a dummy API key. Place an API gateway or ingress authentication in front for production.
* **Scaling**: Increase `replicas` per model and add engine-specific flags through environment variables for tensor parallelism or sharding.
## Uninstall
```bash
make -C oci/multi-model-LLM uninstall RELEASE=ms NAMESPACE=llm
```
## License
MIT

View File

@ -0,0 +1,6 @@
# Patterns excluded when the chart is packaged.

# VCS / CI metadata
.git/
.github/

# Previously packaged chart archives
*.tgz

# Editor and OS leftovers
*.swp
*.swo
.DS_Store

View File

@ -0,0 +1,14 @@
# Chart metadata for the multi-model LLM serving chart.
apiVersion: v2
name: model-serving
description: "Multi-model LLM serving (vLLM / SGLang / Ollama) with one API host & path routing"
type: application
version: 0.1.0
# Quoted so the value is always read as a string.
appVersion: "0.1.0"
# The "-0" suffix lets the constraint match clusters whose version carries a
# vendor pre-release suffix (for example v1.27.3-gke.100), which a plain
# ">=1.25.0" would reject.
kubeVersion: ">=1.25.0-0"
home: https://github.com/svc-design/artifacts
keywords:
  - llm
  - vllm
  - sglang
  - ollama
  - cuda
  - gpu

View File

@ -0,0 +1,14 @@
{{/*
Base name shared by a release's resources: "<release name>-<chart name>",
cut to 63 characters with a trailing "-" removed.
*/}}
{{- define "model-serving.fullname" -}}
{{- list .Release.Name .Chart.Name | join "-" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Service-style name: the release full name followed by "-svc".
*/}}
{{- define "model-serving.svcname" -}}
{{- include "model-serving.fullname" . | printf "%s-svc" -}}
{{- end -}}
{{/*
Render a full image reference "<registry>/<image>:<tag>".

Expects a dict with:
  Values - the chart's root .Values (global.imageRegistry is read from it)
  image  - repository path of the image
  tag    - image tag; "latest" when unset or empty

When global.imageRegistry is unset or empty the registry prefix is omitted
instead of rendering a reference that starts with "/". A trailing "/" on the
registry is dropped so the separator is never doubled, and the tag is passed
through toString so a numeric tag does not break the %s formatting.
*/}}
{{- define "model-serving.image" -}}
{{- $global := .Values.global | default dict -}}
{{- $registry := $global.imageRegistry | default "" | trimSuffix "/" -}}
{{- $repository := .image -}}
{{- $tag := .tag | default "latest" | toString -}}
{{- if $registry -}}
{{- printf "%s/%s:%s" $registry $repository $tag -}}
{{- else -}}
{{- printf "%s:%s" $repository $tag -}}
{{- end -}}
{{- end -}}

View File

@ -0,0 +1,58 @@
{{- /*
Renders one Deployment per entry of .Values.models whose engine is "ollama".

Per-model values read here: name, image, tag, replicas, env, resources,
runtimeClassName, nodeSelector, tolerations. Chart-wide: imagePullSecrets.

replicas falls back to 1 only when the key is absent or null. A plain
`default 1` would also replace an explicit 0, because `default` treats 0 as
empty, which made it impossible to scale a model down to zero.
*/ -}}
{{- $root := . -}}
{{- range $m := .Values.models }}
{{- if eq $m.engine "ollama" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "model-serving.fullname" $root }}-{{ $m.name }}
  labels:
    app.kubernetes.io/name: {{ $m.name }}
    app.kubernetes.io/engine: ollama
spec:
  replicas: {{ if kindIs "invalid" $m.replicas }}1{{ else }}{{ $m.replicas }}{{ end }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $m.name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $m.name }}
        app.kubernetes.io/engine: ollama
    spec:
      containers:
        - name: {{ $m.name }}
          image: {{ include "model-serving.image" (dict "Values" $root.Values "image" $m.image "tag" $m.tag) }}
          imagePullPolicy: IfNotPresent
          {{- with $m.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          ports:
            - containerPort: 11434
          {{- with $m.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- else }}
          resources: {}
          {{- end }}
      runtimeClassName: {{ $m.runtimeClassName | default "nvidia" }}
      {{- with $m.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $m.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $root.Values.imagePullSecrets }}
      imagePullSecrets:
        {{- range . }}
        - name: {{ . }}
        {{- end }}
      {{- end }}
---
{{- end }}
{{- end }}

View File

@ -0,0 +1,58 @@
{{- /*
Renders one Deployment per entry of .Values.models whose engine is "sglang".

Per-model values read here: name, image, tag, replicas, env, resources,
runtimeClassName, nodeSelector, tolerations. Chart-wide: imagePullSecrets.

replicas falls back to 1 only when the key is absent or null. A plain
`default 1` would also replace an explicit 0, because `default` treats 0 as
empty, which made it impossible to scale a model down to zero.
*/ -}}
{{- $root := . -}}
{{- range $m := .Values.models }}
{{- if eq $m.engine "sglang" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "model-serving.fullname" $root }}-{{ $m.name }}
  labels:
    app.kubernetes.io/name: {{ $m.name }}
    app.kubernetes.io/engine: sglang
spec:
  replicas: {{ if kindIs "invalid" $m.replicas }}1{{ else }}{{ $m.replicas }}{{ end }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $m.name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $m.name }}
        app.kubernetes.io/engine: sglang
    spec:
      containers:
        - name: {{ $m.name }}
          image: {{ include "model-serving.image" (dict "Values" $root.Values "image" $m.image "tag" $m.tag) }}
          imagePullPolicy: IfNotPresent
          {{- with $m.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          ports:
            - containerPort: 30000
          {{- with $m.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- else }}
          resources: {}
          {{- end }}
      runtimeClassName: {{ $m.runtimeClassName | default "nvidia" }}
      {{- with $m.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $m.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $root.Values.imagePullSecrets }}
      imagePullSecrets:
        {{- range . }}
        - name: {{ . }}
        {{- end }}
      {{- end }}
---
{{- end }}
{{- end }}

View File

@ -0,0 +1,58 @@
{{- /*
Renders one Deployment per entry of .Values.models whose engine is "vllm".

Per-model values read here: name, image, tag, replicas, env, resources,
runtimeClassName, nodeSelector, tolerations. Chart-wide: imagePullSecrets.

replicas falls back to 1 only when the key is absent or null. A plain
`default 1` would also replace an explicit 0, because `default` treats 0 as
empty, which made it impossible to scale a model down to zero.
*/ -}}
{{- $root := . -}}
{{- range $m := .Values.models }}
{{- if eq $m.engine "vllm" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "model-serving.fullname" $root }}-{{ $m.name }}
  labels:
    app.kubernetes.io/name: {{ $m.name }}
    app.kubernetes.io/engine: vllm
spec:
  replicas: {{ if kindIs "invalid" $m.replicas }}1{{ else }}{{ $m.replicas }}{{ end }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $m.name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $m.name }}
        app.kubernetes.io/engine: vllm
    spec:
      containers:
        - name: {{ $m.name }}
          image: {{ include "model-serving.image" (dict "Values" $root.Values "image" $m.image "tag" $m.tag) }}
          imagePullPolicy: IfNotPresent
          {{- with $m.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          ports:
            - containerPort: 8000
          {{- with $m.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- else }}
          resources: {}
          {{- end }}
      runtimeClassName: {{ $m.runtimeClassName | default "nvidia" }}
      {{- with $m.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $m.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $root.Values.imagePullSecrets }}
      imagePullSecrets:
        {{- range . }}
        - name: {{ . }}
        {{- end }}
      {{- end }}
---
{{- end }}
{{- end }}

View File

@ -0,0 +1,28 @@
{{- /*
Single Ingress that routes one host to every model's Service by path prefix.

Values read from .Values.global.ingress:
  enabled       - render the Ingress at all
  className     - written to spec.ingressClassName (replaces the deprecated
                  kubernetes.io/ingress.class annotation); omitted when empty
  annotations   - optional map copied verbatim into metadata.annotations
  host          - host name for the rule and the TLS entry
  tls           - when true, add a TLS section using tlsSecretName
  tlsSecretName - secret holding the certificate

No path rewriting is configured here, so each backend receives the full
request path including the model prefix.
NOTE(review): confirm the engines serve under that prefix, or supply
controller-specific rewrite annotations through global.ingress.annotations.
*/ -}}
{{- $ingress := .Values.global.ingress -}}
{{- if $ingress.enabled }}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "model-serving.fullname" . }}
  {{- with $ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- with $ingress.className }}
  ingressClassName: {{ . | quote }}
  {{- end }}
  {{- if $ingress.tls }}
  tls:
    - hosts:
        - {{ $ingress.host | quote }}
      secretName: {{ $ingress.tlsSecretName | quote }}
  {{- end }}
  rules:
    - host: {{ $ingress.host | quote }}
      http:
        paths:
          {{- range .Values.models }}
          - path: /{{ .path }}
            pathType: Prefix
            backend:
              service:
                name: {{ include "model-serving.fullname" $ }}-{{ .name }}
                port:
                  number: {{ $.Values.service.port | default 80 }}
          {{- end }}
{{- end }}

View File

@ -0,0 +1,18 @@
{{- /*
One Service per entry of .Values.models, selecting that model's pods by the
app.kubernetes.io/name label. The Service port comes from .Values.service.port
(80 when unset); the target port is chosen from the model's engine:
vllm -> 8000, sglang -> 30000, anything else -> 11434.
*/ -}}
{{- $ctx := . -}}
{{- range $model := .Values.models }}
{{- $targetPort := 11434 }}
{{- if eq $model.engine "vllm" }}
{{- $targetPort = 8000 }}
{{- else if eq $model.engine "sglang" }}
{{- $targetPort = 30000 }}
{{- end }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "model-serving.fullname" $ctx }}-{{ $model.name }}
  labels:
    app.kubernetes.io/name: {{ $model.name }}
spec:
  type: {{ $ctx.Values.service.type | default "ClusterIP" }}
  selector:
    app.kubernetes.io/name: {{ $model.name }}
  ports:
    - name: http
      port: {{ $ctx.Values.service.port | default 80 }}
      targetPort: {{ $targetPort }}
---
{{- end }}

View File

@ -0,0 +1,61 @@
# Default values for the model-serving chart.

global:
  # Registry prefix prepended to every model image.
  imageRegistry: ghcr.io/your-org
  namespace: default
  # One Ingress fronts all models; each model is routed by its `path`.
  ingress:
    enabled: true
    className: nginx
    host: api.svc.plus
    tls: true
    tlsSecretName: model-serving-tls

# Service settings shared by every model's Service.
service:
  type: ClusterIP
  port: 80

# Names of image pull secrets added to every pod.
imagePullSecrets: []

# One Deployment and one Service are rendered per entry.
# `engine` selects the template and container port: vllm, sglang or ollama.
models:
  - name: llama3-8b-vllm
    engine: vllm
    image: 'model-serving/vllm'
    tag: 'cuda12'
    replicas: 1
    path: v1/llama3
    env:
      - name: MODEL_PATH
        value: 'meta-llama/Meta-Llama-3-8B-Instruct'
      - name: VLLM_ARGS
        value: '--max-model-len 8192 --gpu-memory-utilization 0.9'
    resources:
      limits:
        nvidia.com/gpu: 1
    nodeSelector: {}
    tolerations: []

  - name: qwen2-7b-sglang
    engine: sglang
    image: 'model-serving/sglang'
    tag: 'cuda12'
    replicas: 1
    path: v1/qwen2
    env:
      - name: SGLANG_MODEL
        value: 'Qwen/Qwen2-7B-Instruct'
      - name: SGLANG_ARGS
        value: '--tp 1 --context-length 8192'
    resources:
      limits:
        nvidia.com/gpu: 1

  - name: phi3-ollama
    engine: ollama
    image: 'model-serving/ollama'
    tag: 'latest'
    replicas: 1
    path: v1/phi3
    env:
      - name: OLLAMA_MODEL
        value: 'phi3:latest'
    resources:
      limits:
        nvidia.com/gpu: 1