Merge pull request #173 from svc-design/codex/add-dockerfiles-and-helm-chart-for-models-qc3pt7
Add CUDA LLM images and multi-model Helm chart
This commit is contained in:
commit
03b7ba02fc
36
.github/workflows/build-chart-multi-model-LLM.yaml
vendored
Normal file
36
.github/workflows/build-chart-multi-model-LLM.yaml
vendored
Normal file
@ -0,0 +1,36 @@
|
||||
# Lints and packages the model-serving Helm chart, then uploads the packaged
# archive as a workflow artifact.
name: build chart multi-model llm

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/multi-model-LLM/**'
      - '.github/workflows/build-chart-multi-model-LLM.yaml'
  # workflow_dispatch only accepts `inputs`; a `branches` filter is not part of
  # its schema. The branch to run on is chosen when the run is started.
  workflow_dispatch: {}

env:
  CHART_DIR: oci/multi-model-LLM/charts/model-serving

jobs:
  lint-and-package:
    name: Lint and package Helm chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up Helm
        uses: azure/setup-helm@v4
        with:
          version: v3.14.4
      - name: Helm lint
        run: helm lint "$CHART_DIR"
      - name: Helm package
        # The version here must stay in sync with the artifact path below.
        run: helm package "$CHART_DIR" --version 0.1.0 --app-version 0.1.0 -d oci/multi-model-LLM/charts
      - name: Upload chart artifact
        uses: actions/upload-artifact@v4
        with:
          name: model-serving-chart
          path: oci/multi-model-LLM/charts/model-serving-0.1.0.tgz
          # Fail the job instead of only warning when the package is missing.
          if-no-files-found: error
|
||||
29
.github/workflows/build-ci-image-Ollama.yaml
vendored
Normal file
29
.github/workflows/build-ci-image-Ollama.yaml
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
# Builds the Ollama image from oci/base/cuda/Ollama by delegating to the shared
# svc-design build-images reusable workflow.
name: build image ollama

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/base/cuda/Ollama/Dockerfile'
      - '.github/workflows/build-ci-image-Ollama.yaml'
  # workflow_dispatch only accepts `inputs`; a `branches` filter is not part of
  # its schema. The branch to run on is chosen when the run is started.
  workflow_dispatch: {}

# NOTE(review): IMAGE_REPO is not referenced anywhere in this workflow file.
env:
  IMAGE_REPO: "artifact.svc.plus"

jobs:
  build-ollama:
    name: Build Ollama image
    uses: svc-design/actions/.github/workflows/build-images.yaml@main
    with:
      method: 'docker'
      registry_addr: "harbor.onwalk.net"
      dockerfile_path: 'oci/base/cuda/Ollama'
      image_name: 'public/base/cuda/ollama'
      image_tag: 'latest'
    secrets:
      artifactory_sa: ${{ secrets.REPO_USER }}
      artifactory_pw: ${{ secrets.HELM_REPO_PASSWORD }}
|
||||
29
.github/workflows/build-ci-image-SGLang.yaml
vendored
Normal file
29
.github/workflows/build-ci-image-SGLang.yaml
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
# Builds the CUDA SGLang image from oci/base/cuda/SGLang by delegating to the
# shared svc-design build-images reusable workflow.
name: build image cuda sglang

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/base/cuda/SGLang/Dockerfile'
      - '.github/workflows/build-ci-image-SGLang.yaml'
  # workflow_dispatch only accepts `inputs`; a `branches` filter is not part of
  # its schema. The branch to run on is chosen when the run is started.
  workflow_dispatch: {}

# NOTE(review): IMAGE_REPO is not referenced anywhere in this workflow file.
env:
  IMAGE_REPO: "artifact.svc.plus"

jobs:
  build-sglang:
    name: Build CUDA SGLang image
    uses: svc-design/actions/.github/workflows/build-images.yaml@main
    with:
      method: 'docker'
      registry_addr: "harbor.onwalk.net"
      dockerfile_path: 'oci/base/cuda/SGLang'
      image_name: 'public/base/cuda/sglang'
      image_tag: 'cuda12'
    secrets:
      artifactory_sa: ${{ secrets.REPO_USER }}
      artifactory_pw: ${{ secrets.HELM_REPO_PASSWORD }}
|
||||
29
.github/workflows/build-ci-image-vLLM.yaml
vendored
Normal file
29
.github/workflows/build-ci-image-vLLM.yaml
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
# Builds the CUDA vLLM image from oci/base/cuda/vLLM by delegating to the
# shared svc-design build-images reusable workflow.
name: build image cuda vllm

on:
  pull_request:
    branches:
      - main
    paths:
      - 'oci/base/cuda/vLLM/Dockerfile'
      - '.github/workflows/build-ci-image-vLLM.yaml'
  # workflow_dispatch only accepts `inputs`; a `branches` filter is not part of
  # its schema. The branch to run on is chosen when the run is started.
  workflow_dispatch: {}

# NOTE(review): IMAGE_REPO is not referenced anywhere in this workflow file.
env:
  IMAGE_REPO: "artifact.svc.plus"

jobs:
  build-vllm:
    name: Build CUDA vLLM image
    uses: svc-design/actions/.github/workflows/build-images.yaml@main
    with:
      method: 'docker'
      registry_addr: "harbor.onwalk.net"
      dockerfile_path: 'oci/base/cuda/vLLM'
      image_name: 'public/base/cuda/vllm'
      image_tag: 'cuda12'
    secrets:
      artifactory_sa: ${{ secrets.REPO_USER }}
      artifactory_pw: ${{ secrets.HELM_REPO_PASSWORD }}
|
||||
29
oci/base/cuda/Makefile
Normal file
29
oci/base/cuda/Makefile
Normal file
@ -0,0 +1,29 @@
|
||||
# Build and push helpers for the three serving images in this directory
# (vLLM, SGLang, Ollama). ORG, REGISTRY and the per-engine tags can be
# overridden on the command line.
ORG        ?= your-org
REGISTRY   ?= ghcr.io/$(ORG)/model-serving
VLLM_TAG   ?= cuda12
SGLANG_TAG ?= cuda12
OLLAMA_TAG ?= latest

# Fully qualified image references; expanded lazily so overrides take effect.
VLLM_IMAGE   = $(REGISTRY)/vllm:$(VLLM_TAG)
SGLANG_IMAGE = $(REGISTRY)/sglang:$(SGLANG_TAG)
OLLAMA_IMAGE = $(REGISTRY)/ollama:$(OLLAMA_TAG)

BUILD_TARGETS := docker-build-vllm docker-build-sglang docker-build-ollama
PUSH_TARGETS  := docker-push-vllm docker-push-sglang docker-push-ollama

.PHONY: docker-build docker-push $(BUILD_TARGETS) $(PUSH_TARGETS)

# Aggregate targets: build or push every image.
docker-build: $(BUILD_TARGETS)

docker-push: $(PUSH_TARGETS)

# Each build uses the engine's subdirectory as the Docker build context.
docker-build-vllm:
	docker build -t $(VLLM_IMAGE) vLLM

docker-build-sglang:
	docker build -t $(SGLANG_IMAGE) SGLang

docker-build-ollama:
	docker build -t $(OLLAMA_IMAGE) Ollama

docker-push-vllm:
	docker push $(VLLM_IMAGE)

docker-push-sglang:
	docker push $(SGLANG_IMAGE)

docker-push-ollama:
	docker push $(OLLAMA_IMAGE)
|
||||
28
oci/base/cuda/Ollama/Dockerfile
Normal file
28
oci/base/cuda/Ollama/Dockerfile
Normal file
@ -0,0 +1,28 @@
|
||||
# Ollama runtime image that leans on host NVIDIA drivers via container toolkit
FROM ubuntu:22.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
      curl ca-certificates unzip gnupg \
    && rm -rf /var/lib/apt/lists/*

# The release archive is laid out as bin/ollama plus lib/ollama/..., so it is
# unpacked under /usr/local to place the binary at /usr/local/bin/ollama.
# Extracting into /usr/local/bin would yield /usr/local/bin/bin/ollama and the
# chmod below would fail.
RUN curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | tar -xz -C /usr/local \
    && chmod +x /usr/local/bin/ollama

# Run as an unprivileged user that owns its own model directory.
RUN useradd -m -u 10001 app && mkdir -p /home/app/.ollama && chown -R app:app /home/app
USER app

ENV OLLAMA_HOST=0.0.0.0:11434 \
    OLLAMA_MODELS=/home/app/.ollama/models \
    OLLAMA_MODEL="phi3:latest"

EXPOSE 11434

HEALTHCHECK --interval=30s --timeout=5s --start-period=30s CMD curl -fsS http://127.0.0.1:11434/api/tags || exit 1

# Start the server in the background, poll /api/tags for up to 30 seconds,
# attempt a best-effort pull of $OLLAMA_MODEL, then block on the server.
ENTRYPOINT ["bash","-lc","set -euo pipefail; \
  ollama serve & \
  for i in $(seq 1 30); do sleep 1; curl -fsS http://127.0.0.1:11434/api/tags && break || true; done; \
  ollama pull \"$OLLAMA_MODEL\" || true; \
  wait -n"]
|
||||
30
oci/base/cuda/SGLang/Dockerfile
Normal file
30
oci/base/cuda/SGLang/Dockerfile
Normal file
@ -0,0 +1,30 @@
|
||||
# CUDA 12.1 runtime base for SGLang
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 python3-venv python3-pip git curl ca-certificates build-essential \
    && rm -rf /var/lib/apt/lists/*

ENV PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Install CUDA-enabled PyTorch first, then SGLang with its server runtime.
# NOTE(review): the `srt` extra is what pulls in the launch_server
# dependencies; confirm against the packaging metadata of the pinned release.
RUN pip3 install --upgrade pip \
    && pip3 install --extra-index-url https://download.pytorch.org/whl/cu121 \
         torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 \
    && pip3 install "sglang[srt]==0.2.3" uvicorn fastapi

EXPOSE 30000

ENV SGLANG_MODEL="Qwen/Qwen2-7B-Instruct" \
    SGLANG_PORT=30000 \
    SGLANG_ARGS="--tp 1 --context-length 8192"

# Run as an unprivileged user; /models is handed over for model storage.
RUN useradd -m -u 10001 app && mkdir -p /models && chown -R app:app /models
USER app

HEALTHCHECK --interval=30s --timeout=5s --start-period=30s CMD curl -fsS http://127.0.0.1:${SGLANG_PORT}/v1/models || exit 1

# --host 0.0.0.0: the server otherwise listens on loopback only, which makes it
# unreachable through the published port or a Kubernetes Service.
# The OpenAI-compatible routes are served by default, so no enabling flag is
# passed. `exec` replaces the shell so the server receives stop signals.
ENTRYPOINT ["bash","-lc","exec python3 -m sglang.launch_server --model-path \"$SGLANG_MODEL\" --host 0.0.0.0 --port $SGLANG_PORT --trust-remote-code $SGLANG_ARGS"]
|
||||
33
oci/base/cuda/vLLM/Dockerfile
Normal file
33
oci/base/cuda/vLLM/Dockerfile
Normal file
@ -0,0 +1,33 @@
|
||||
# CUDA 12.1 + cuDNN8 runtime base — tested with recent PyTorch wheels
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

ARG DEBIAN_FRONTEND=noninteractive

# System deps + Python
RUN apt-get update && apt-get install -y --no-install-recommends \
      python3 python3-venv python3-pip git curl ca-certificates \
    && rm -rf /var/lib/apt/lists/*

ENV PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Install CUDA-enabled PyTorch + vLLM
RUN pip3 install --upgrade pip \
    && pip3 install --extra-index-url https://download.pytorch.org/whl/cu121 \
         torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 \
    && pip3 install vllm==0.5.2 uvicorn fastapi

EXPOSE 8000

ENV MODEL_PATH="meta-llama/Meta-Llama-3-8B-Instruct" \
    VLLM_ARGS="--max-model-len 8192 --gpu-memory-utilization 0.9" \
    HF_HOME=/models/.cache \
    VLLM_WORKER_USE_GRAPH_EXECUTOR=1

# Run as an unprivileged user; /models (which holds HF_HOME) is handed over.
RUN useradd -m -u 10001 app && mkdir -p /models && chown -R app:app /models
USER app

# Probe /health rather than /v1/models: with --api-key set, every /v1/* route
# requires the bearer token, so an unauthenticated `curl -f` would get a 401
# and mark the container unhealthy forever.
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s CMD curl -fsS http://127.0.0.1:8000/health || exit 1

# The module entry point is used so startup does not depend on the pinned
# release shipping the `vllm` console script. `exec` replaces the shell so the
# server receives stop signals directly.
ENTRYPOINT ["bash","-lc","exec python3 -m vllm.entrypoints.openai.api_server --model \"$MODEL_PATH\" --port 8000 --api-key dummy $VLLM_ARGS"]
|
||||
35
oci/multi-model-LLM/Makefile
Normal file
35
oci/multi-model-LLM/Makefile
Normal file
@ -0,0 +1,35 @@
|
||||
# Entry points for building the serving images and for linting, packaging,
# publishing and installing the model-serving Helm chart.
ORG            ?= your-org
IMAGE_REGISTRY ?= ghcr.io/$(ORG)/model-serving
CHART_NAME     ?= model-serving
CHART_DIR      := charts/$(CHART_NAME)
VERSION        ?= 0.1.0

# Helm release name and target namespace used by install / uninstall.
RELEASE   ?= ms
NAMESPACE ?= llm

# Directory holding the image Makefile, and the archive helm-package produces.
CUDA_DIR  := ../base/cuda
CHART_PKG  = charts/$(CHART_NAME)-$(VERSION).tgz

.PHONY: docker-build docker-push helm-lint helm-package helm-push install uninstall template

# Image targets are delegated to the Makefile next to the Dockerfiles.
docker-build:
	$(MAKE) -C $(CUDA_DIR) docker-build REGISTRY=$(IMAGE_REGISTRY)

docker-push:
	$(MAKE) -C $(CUDA_DIR) docker-push REGISTRY=$(IMAGE_REGISTRY)

helm-lint:
	helm lint $(CHART_DIR)

helm-package:
	helm package $(CHART_DIR) --version $(VERSION) --app-version $(VERSION) -d charts/

# Packaging always runs first so the pushed archive matches VERSION.
helm-push: helm-package
	helm push $(CHART_PKG) oci://ghcr.io/$(ORG)/helm

# Creates the namespace if needed, then installs or upgrades the release.
install:
	kubectl create ns $(NAMESPACE) --dry-run=client -o yaml | kubectl apply -f -
	helm upgrade --install $(RELEASE) $(CHART_DIR) -n $(NAMESPACE)

# A failing uninstall is ignored.
uninstall:
	helm uninstall $(RELEASE) -n $(NAMESPACE) || true

template:
	helm template $(RELEASE) $(CHART_DIR)
|
||||
101
oci/multi-model-LLM/README.md
Normal file
101
oci/multi-model-LLM/README.md
Normal file
@ -0,0 +1,101 @@
|
||||
# CUDA LLM Serving — vLLM / SGLang / Ollama (Kubernetes)
|
||||
|
||||
This package bundles three CUDA-ready images plus a single Helm chart that can serve **multiple models** behind one host with **path-based routing** such as:
|
||||
|
||||
- `https://api.svc.plus/v1/llama3` → vLLM (OpenAI-compatible)
|
||||
- `https://api.svc.plus/v1/qwen2` → SGLang (OpenAI-compatible)
|
||||
- `https://api.svc.plus/v1/phi3` → Ollama `/api/*`
|
||||
|
||||
The Dockerfiles live under [`oci/base/cuda`](../base/cuda/), while the Helm chart is in [`charts/model-serving`](charts/model-serving/).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Kubernetes ≥ 1.25
|
||||
- NVIDIA GPUs on worker nodes + NVIDIA Container Toolkit
|
||||
- Ingress Controller (e.g. NGINX) and TLS secret if using HTTPS
|
||||
- (Optional) GitHub Container Registry (GHCR) for distributing images and charts
|
||||
|
||||
## Build & Publish
|
||||
|
||||
```bash
|
||||
# 1) Build and push images to GHCR (adjust ORG)
|
||||
make -C oci/base/cuda ORG=svc-design docker-build docker-push
|
||||
|
||||
# 2) Lint & package the chart
|
||||
make -C oci/multi-model-LLM helm-lint helm-package VERSION=0.1.0
|
||||
|
||||
# 3) Push chart as OCI to GHCR
|
||||
make -C oci/multi-model-LLM ORG=svc-design VERSION=0.1.0 helm-push
|
||||
```
|
||||
|
||||
> Authenticate GHCR first:
|
||||
>
|
||||
> ```bash
|
||||
> echo $GHCR_TOKEN | docker login ghcr.io -u <GITHUB_USER> --password-stdin
|
||||
> helm registry login ghcr.io -u <GITHUB_USER> -p $GHCR_TOKEN
|
||||
> ```
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
# install into namespace llm with release name ms
|
||||
make -C oci/multi-model-LLM install RELEASE=ms NAMESPACE=llm
|
||||
```
|
||||
|
||||
## Configure Models
|
||||
|
||||
Edit [`charts/model-serving/values.yaml`](charts/model-serving/values.yaml) and extend the `models:` list. Example:
|
||||
|
||||
```yaml
|
||||
models:
|
||||
- name: llama3-8b-vllm
|
||||
engine: vllm
|
||||
image: "model-serving/vllm"
|
||||
tag: "cuda12"
|
||||
path: v1/llama3
|
||||
env:
|
||||
- name: MODEL_PATH
|
||||
value: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
- name: VLLM_ARGS
|
||||
value: --max-model-len 8192 --gpu-memory-utilization 0.9
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
||||
|
||||
- name: qwen2-7b-sglang
|
||||
engine: sglang
|
||||
image: "model-serving/sglang"
|
||||
tag: "cuda12"
|
||||
path: v1/qwen2
|
||||
env:
|
||||
- name: SGLANG_MODEL
|
||||
value: Qwen/Qwen2-7B-Instruct
|
||||
|
||||
- name: phi3-ollama
|
||||
engine: ollama
|
||||
image: "model-serving/ollama"
|
||||
tag: latest
|
||||
path: v1/phi3
|
||||
env:
|
||||
- name: OLLAMA_MODEL
|
||||
value: phi3:latest
|
||||
```
|
||||
|
||||
Deployments and services are generated per model, and a single ingress exposes them under unique paths.
|
||||
|
||||
## Runtime Notes
|
||||
|
||||
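* **GPU scheduling**: Templates default `runtimeClassName` to `nvidia`, and the default `values.yaml` requests one GPU per model through `resources.limits` (a model without `resources` gets no GPU limit). Ensure the cluster has the NVIDIA device plugin and RuntimeClass defined, or override `runtimeClassName` per model.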
|
||||
* **Storage**: vLLM/SGLang cache defaults to the container filesystem. Mount an external volume by extending the template if persistence is required.
|
||||
* **Authentication**: vLLM launches with a dummy API key. Place an API gateway or ingress authentication in front for production.
|
||||
* **Scaling**: Increase `replicas` per model and add engine-specific flags through environment variables for tensor parallelism or sharding.
|
||||
|
||||
## Uninstall
|
||||
|
||||
```bash
|
||||
make -C oci/multi-model-LLM uninstall RELEASE=ms NAMESPACE=llm
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
6
oci/multi-model-LLM/charts/model-serving/.helmignore
Normal file
6
oci/multi-model-LLM/charts/model-serving/.helmignore
Normal file
@ -0,0 +1,6 @@
|
||||
# Patterns excluded from the packaged chart.

# Version-control and CI metadata
.git/
.github/

# Editor and OS leftovers
.DS_Store
*.swp
*.swo

# Previously packaged chart archives
*.tgz
|
||||
14
oci/multi-model-LLM/charts/model-serving/Chart.yaml
Normal file
14
oci/multi-model-LLM/charts/model-serving/Chart.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
# Chart metadata for the multi-model LLM serving chart.
apiVersion: v2
name: model-serving
description: Multi-model LLM serving (vLLM / SGLang / Ollama) with one API host & path routing
type: application
version: 0.1.0
# Minimum Kubernetes version the chart will install on.
kubeVersion: '>=1.25.0'
home: https://github.com/svc-design/artifacts
keywords:
  - llm
  - vllm
  - sglang
  - ollama
  - cuda
  - gpu
|
||||
@ -0,0 +1,14 @@
|
||||
{{/*
model-serving.fullname
Renders "<release name>-<chart name>", truncated to 63 characters with any
trailing "-" removed. Expects the chart root context.
*/}}
{{- define "model-serving.fullname" -}}
{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
model-serving.svcname
Renders the fullname with a "-svc" suffix. Expects the chart root context.
*/}}
{{- define "model-serving.svcname" -}}
{{- printf "%s-svc" (include "model-serving.fullname" .) -}}
{{- end -}}

{{/*
model-serving.image
Renders "<registry>/<image>:<tag>" from a dict carrying:
  Values - the chart values (reads .Values.global.imageRegistry)
  image  - repository path of the image
  tag    - image tag; falls back to "latest" when empty
When the registry is empty the prefix is omitted, and a trailing "/" on the
registry is dropped, so the result never contains a leading or doubled slash.
*/}}
{{- define "model-serving.image" -}}
{{- $reg := .Values.global.imageRegistry | default "" | trimSuffix "/" -}}
{{- $img := .image -}}
{{- $tag := .tag | default "latest" -}}
{{- if $reg -}}
{{- printf "%s/%s:%s" $reg $img $tag -}}
{{- else -}}
{{- printf "%s:%s" $img $tag -}}
{{- end -}}
{{- end -}}
|
||||
@ -0,0 +1,58 @@
|
||||
{{- /* Renders one Deployment for every .Values.models entry whose engine is "ollama". */ -}}
{{- $chart := . -}}
{{- range $model := .Values.models }}
{{- if eq $model.engine "ollama" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "model-serving.fullname" $chart }}-{{ $model.name }}
  labels:
    app.kubernetes.io/name: {{ $model.name }}
    app.kubernetes.io/engine: ollama
spec:
  replicas: {{ default 1 $model.replicas }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $model.name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $model.name }}
        app.kubernetes.io/engine: ollama
    spec:
      containers:
        - name: {{ $model.name }}
          image: {{ include "model-serving.image" (dict "Values" $chart.Values "image" $model.image "tag" $model.tag) }}
          imagePullPolicy: IfNotPresent
          {{- /* Per-model environment variables; values are always rendered as strings. */}}
          {{- with $model.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          ports:
            - containerPort: 11434
          {{- with $model.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- else }}
          resources: {}
          {{- end }}
      runtimeClassName: {{ default "nvidia" $model.runtimeClassName }}
      {{- with $model.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $model.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $chart.Values.imagePullSecrets }}
      imagePullSecrets:
        {{- range . }}
        - name: {{ . }}
        {{- end }}
      {{- end }}
---
{{- end }}
{{- end }}
|
||||
@ -0,0 +1,58 @@
|
||||
{{- /* Renders one Deployment for every .Values.models entry whose engine is "sglang". */ -}}
{{- $chart := . -}}
{{- range $model := .Values.models }}
{{- if eq $model.engine "sglang" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "model-serving.fullname" $chart }}-{{ $model.name }}
  labels:
    app.kubernetes.io/name: {{ $model.name }}
    app.kubernetes.io/engine: sglang
spec:
  replicas: {{ default 1 $model.replicas }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $model.name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $model.name }}
        app.kubernetes.io/engine: sglang
    spec:
      containers:
        - name: {{ $model.name }}
          image: {{ include "model-serving.image" (dict "Values" $chart.Values "image" $model.image "tag" $model.tag) }}
          imagePullPolicy: IfNotPresent
          {{- /* Per-model environment variables; values are always rendered as strings. */}}
          {{- with $model.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          ports:
            - containerPort: 30000
          {{- with $model.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- else }}
          resources: {}
          {{- end }}
      runtimeClassName: {{ default "nvidia" $model.runtimeClassName }}
      {{- with $model.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $model.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $chart.Values.imagePullSecrets }}
      imagePullSecrets:
        {{- range . }}
        - name: {{ . }}
        {{- end }}
      {{- end }}
---
{{- end }}
{{- end }}
|
||||
@ -0,0 +1,58 @@
|
||||
{{- /* Renders one Deployment for every .Values.models entry whose engine is "vllm". */ -}}
{{- $chart := . -}}
{{- range $model := .Values.models }}
{{- if eq $model.engine "vllm" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "model-serving.fullname" $chart }}-{{ $model.name }}
  labels:
    app.kubernetes.io/name: {{ $model.name }}
    app.kubernetes.io/engine: vllm
spec:
  replicas: {{ default 1 $model.replicas }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $model.name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $model.name }}
        app.kubernetes.io/engine: vllm
    spec:
      containers:
        - name: {{ $model.name }}
          image: {{ include "model-serving.image" (dict "Values" $chart.Values "image" $model.image "tag" $model.tag) }}
          imagePullPolicy: IfNotPresent
          {{- /* Per-model environment variables; values are always rendered as strings. */}}
          {{- with $model.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          ports:
            - containerPort: 8000
          {{- with $model.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- else }}
          resources: {}
          {{- end }}
      runtimeClassName: {{ default "nvidia" $model.runtimeClassName }}
      {{- with $model.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $model.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with $chart.Values.imagePullSecrets }}
      imagePullSecrets:
        {{- range . }}
        - name: {{ . }}
        {{- end }}
      {{- end }}
---
{{- end }}
{{- end }}
|
||||
@ -0,0 +1,28 @@
|
||||
{{- /*
Single Ingress that exposes every entry of .Values.models on one host, one
Prefix path per model, each routed to that model's Service.
NOTE(review): no path rewrite is configured here, so backends receive the
request path including the /<path> prefix — confirm the engines or the ingress
controller handle that.
*/ -}}
{{- if .Values.global.ingress.enabled }}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "model-serving.fullname" . }}
spec:
  {{- /* spec.ingressClassName replaces the deprecated kubernetes.io/ingress.class annotation; omitted when no class is set. */}}
  {{- with .Values.global.ingress.className }}
  ingressClassName: {{ . | quote }}
  {{- end }}
  {{- if .Values.global.ingress.tls }}
  tls:
    - hosts:
        - {{ .Values.global.ingress.host | quote }}
      secretName: {{ .Values.global.ingress.tlsSecretName | quote }}
  {{- end }}
  rules:
    - host: {{ .Values.global.ingress.host | quote }}
      http:
        paths:
          {{- range .Values.models }}
          - path: /{{ .path }}
            pathType: Prefix
            backend:
              service:
                name: {{ include "model-serving.fullname" $ }}-{{ .name }}
                port:
                  number: {{ $.Values.service.port | default 80 }}
          {{- end }}
{{- end }}
|
||||
@ -0,0 +1,18 @@
|
||||
{{- /*
Renders one Service per .Values.models entry. The target port follows the
engine: 8000 for vllm, 30000 for sglang, 11434 for anything else.
*/ -}}
{{- $chart := . -}}
{{- range $model := .Values.models }}
{{- $backendPort := 11434 }}
{{- if eq $model.engine "vllm" }}
{{- $backendPort = 8000 }}
{{- else if eq $model.engine "sglang" }}
{{- $backendPort = 30000 }}
{{- end }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "model-serving.fullname" $chart }}-{{ $model.name }}
  labels:
    app.kubernetes.io/name: {{ $model.name }}
spec:
  type: {{ default "ClusterIP" $chart.Values.service.type }}
  selector:
    app.kubernetes.io/name: {{ $model.name }}
  ports:
    - name: http
      port: {{ default 80 $chart.Values.service.port }}
      targetPort: {{ $backendPort }}
---
{{- end }}
|
||||
61
oci/multi-model-LLM/charts/model-serving/values.yaml
Normal file
61
oci/multi-model-LLM/charts/model-serving/values.yaml
Normal file
@ -0,0 +1,61 @@
|
||||
# Default values for the model-serving chart.

global:
  # Registry prefix prepended to every model image.
  imageRegistry: ghcr.io/your-org
  namespace: default
  # One Ingress serves all models on a single host.
  ingress:
    enabled: true
    className: nginx
    host: api.svc.plus
    tls: true
    tlsSecretName: model-serving-tls

# One Deployment and one Service are rendered per entry; `engine` selects the
# deployment template and `path` is the ingress path (without leading slash).
models:
  - name: llama3-8b-vllm
    engine: vllm
    image: model-serving/vllm
    tag: cuda12
    replicas: 1
    path: v1/llama3
    env:
      - name: MODEL_PATH
        value: 'meta-llama/Meta-Llama-3-8B-Instruct'
      - name: VLLM_ARGS
        value: '--max-model-len 8192 --gpu-memory-utilization 0.9'
    resources:
      limits:
        nvidia.com/gpu: 1
    nodeSelector: {}
    tolerations: []

  - name: qwen2-7b-sglang
    engine: sglang
    image: model-serving/sglang
    tag: cuda12
    replicas: 1
    path: v1/qwen2
    env:
      - name: SGLANG_MODEL
        value: 'Qwen/Qwen2-7B-Instruct'
      - name: SGLANG_ARGS
        value: '--tp 1 --context-length 8192'
    resources:
      limits:
        nvidia.com/gpu: 1

  - name: phi3-ollama
    engine: ollama
    image: model-serving/ollama
    tag: latest
    replicas: 1
    path: v1/phi3
    env:
      - name: OLLAMA_MODEL
        value: 'phi3:latest'
    resources:
      limits:
        nvidia.com/gpu: 1

# Settings shared by every per-model Service.
service:
  type: ClusterIP
  port: 80

# Names of image pull secrets attached to every pod.
imagePullSecrets: []
|
||||
Loading…
Reference in New Issue
Block a user