feat(gpu_inference): add comprehensive GPU inference infrastructure with Sealos, Ray, and vLLM

This commit is contained in:
Haitao Pan 2026-04-23 19:17:23 +08:00
parent 413d46995b
commit 515ba95c75
38 changed files with 585 additions and 0 deletions

View File

@ -0,0 +1,8 @@
---
# Stage 1: baseline host preparation. Targets every inventory host and runs
# with privilege escalation. Roles are referenced by path and execute in the
# order listed.
- name: Prepare Host Environment
  hosts: all
  become: true
  roles:
    - role: roles/vhosts/common
    - role: roles/vhosts/kernel_tuning
    - role: roles/docker/container_runtime

View File

@ -0,0 +1,7 @@
---
# Stage 2: cluster bootstrap on the `masters` group. Roles run in list order,
# so sealos_cluster completes before cni_cilium starts.
- name: Install Kubernetes via Sealos
  hosts: masters
  become: true
  roles:
    - role: roles/vhosts/sealos_cluster
    - role: roles/vhosts/cni_cilium

View File

@ -0,0 +1,6 @@
---
# Stage 3: GPU Operator rollout, driven from the first host of `masters`.
- name: Install NVIDIA GPU Operator
  hosts: masters[0]
  become: true
  roles:
    - role: roles/charts/nvidia_gpu_operator

7
gpu_inference_04_ray.yml Normal file
View File

@ -0,0 +1,7 @@
---
# Stage 4: Ray rollout, driven from the first host of `masters`.
# ray_cluster is applied before ray_service (roles run in list order).
- name: Deploy Ray Cluster
  hosts: masters[0]
  become: true
  roles:
    - role: roles/charts/ray_cluster
    - role: roles/charts/ray_service

View File

@ -0,0 +1,7 @@
---
# Stage 5: vLLM rollout, driven from the first host of `masters`.
# vllm_runtime is applied before vllm_service (roles run in list order).
- name: Deploy vLLM Inference Service
  hosts: masters[0]
  become: true
  roles:
    - role: roles/charts/vllm_runtime
    - role: roles/charts/vllm_service

6
gpu_inference_site.yml Normal file
View File

@ -0,0 +1,6 @@
---
# Entry-point playbook: chains the five numbered stage playbooks in order
# (host prep, Sealos, GPU operator, Ray, vLLM).
- ansible.builtin.import_playbook: gpu_inference_01_prepare.yml
- ansible.builtin.import_playbook: gpu_inference_02_sealos.yml
- ansible.builtin.import_playbook: gpu_inference_03_gpu_operator.yml
- ansible.builtin.import_playbook: gpu_inference_04_ray.yml
- ansible.builtin.import_playbook: gpu_inference_05_vllm.yml

View File

@ -0,0 +1,27 @@
---
# Inventory-wide variables shared by the GPU inference playbooks.

# Global versions and images
kubernetes_version: 'v1.28.9'
sealos_version: '5.0.0'
cilium_version: '1.15.5'
gpu_operator_version: 'v24.3.0'
kuberay_version: '1.1.0'
ray_version: '2.9.0'
vllm_image: 'vllm/vllm-openai:v0.4.2'

# Network configuration
pod_cidr: '10.244.0.0/16'
service_cidr: '10.96.0.0/12'
nccl_socket_ifname: 'eth0'
gloo_socket_ifname: 'eth0'

# Model and inference configuration
vllm_model: '/models/Llama-3-70B-Instruct'
vllm_tensor_parallel_size: 2
vllm_pipeline_parallel_size: 1

# GPU driver policy
driver_enabled: true
driver_version: '535.129.03'
dcgm_exporter_enabled: true

# Remote user Ansible connects as
ansible_user: 'root'

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1 @@
---

13
inventory/hosts.ini Normal file
View File

@ -0,0 +1,13 @@
# Control-plane hosts.
[masters]
k8s-master-01 ansible_host=10.0.0.10

# GPU worker hosts; each carries an `accelerator` host variable.
[gpu_workers]
k8s-gpu-01 ansible_host=10.0.0.21 accelerator=nvidia-h100
k8s-gpu-02 ansible_host=10.0.0.22 accelerator=nvidia-h100

# Ray workers are the GPU workers.
[ray_workers:children]
gpu_workers

# Every host that is part of the Kubernetes cluster.
[k8s_cluster:children]
masters
gpu_workers

View File

@ -0,0 +1,15 @@
---
# Default variables for the NVIDIA GPU Operator role.

gpu_operator_namespace: "gpu-operator"
gpu_operator_release_name: "gpu-operator"
# Track the inventory-wide `gpu_operator_version` when it is defined so the
# chart version is pinned in a single place; otherwise fall back to the
# version this role was written against.
gpu_operator_chart_version: "{{ gpu_operator_version | default('v24.3.0') }}"

# Air-gapped / private registry support
gpu_operator_repository: "https://helm.ngc.nvidia.com/nvidia"
image_pull_secrets: []

# Operator settings (consumed by the role's Helm values template)
driver_enabled: true
driver_version: "535.129.03"
toolkit_enabled: true
mig_strategy: "single"  # none, single, mixed
dcgm_exporter_enabled: true

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1,28 @@
---
# Installs the NVIDIA GPU Operator Helm chart and then runs validate.yml.
# Every task is gated to the first host of the `masters` group.

- name: Create GPU Operator namespace
  kubernetes.core.k8s:
    api_version: v1
    kind: Namespace
    name: "{{ gpu_operator_namespace }}"
    state: present
  when: inventory_hostname == groups['masters'][0]

- name: Add NVIDIA helm repo
  kubernetes.core.helm_repository:
    name: nvidia
    repo_url: "{{ gpu_operator_repository }}"
  when: inventory_hostname == groups['masters'][0]

- name: Deploy GPU Operator
  kubernetes.core.helm:
    name: "{{ gpu_operator_release_name }}"
    chart_ref: nvidia/gpu-operator
    release_namespace: "{{ gpu_operator_namespace }}"
    version: "{{ gpu_operator_chart_version }}"
    # Refresh the local repo index first: when the repo was added on an
    # earlier run, a newly pinned chart version is otherwise not found.
    update_repo_cache: true
    # The values template is rendered to text and parsed into a mapping.
    values: "{{ lookup('template', 'values.yaml.j2') | from_yaml }}"
    wait: true
  when: inventory_hostname == groups['masters'][0]

- name: Include validation tasks
  ansible.builtin.include_tasks: validate.yml
  when: inventory_hostname == groups['masters'][0]

View File

@ -0,0 +1,15 @@
---
# Post-install checks for the GPU Operator: the device-plugin daemonset must
# roll out and GPU nodes must advertise the nvidia.com/gpu resource.
# Neither command needs shell features, so both use `command` with argv.

- name: Wait for NVIDIA Device Plugin daemonset to be ready
  ansible.builtin.command:
    argv:
      - kubectl
      - rollout
      - status
      - daemonset/nvidia-device-plugin-daemonset
      - "--namespace={{ gpu_operator_namespace }}"
      - "--timeout=300s"
  register: ds_status
  # The operator creates this daemonset asynchronously, so the first attempts
  # can fail straight away with NotFound; retry instead of aborting the play.
  until: ds_status.rc == 0
  retries: 6
  delay: 10
  changed_when: false

- name: Validate GPU resources are allocatable
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - nodes
      - "--selector=nvidia.com/gpu.present=true"
      - "--output=jsonpath={.items[*].status.allocatable}"
  register: gpu_allocatable
  # Poll until the allocatable map of the selected nodes mentions the GPU
  # resource name.
  until: "'nvidia.com/gpu' in gpu_allocatable.stdout"
  retries: 30
  delay: 20
  changed_when: false

View File

@ -0,0 +1,15 @@
# Helm values for the GPU Operator release. The deploy task renders this
# template and parses the result with from_yaml before passing it to Helm.
# NOTE(review): indentation is not visible in this view, so the nesting of
# the keys below (notably `mig` and `imagePullSecrets`) must be confirmed
# against the chart's values schema.
driver:
enabled: {{ driver_enabled }}
version: "{{ driver_version }}"
toolkit:
enabled: {{ toolkit_enabled }}
devicePlugin:
enabled: true
mig:
strategy: "{{ mig_strategy }}"
dcgmExporter:
enabled: {{ dcgm_exporter_enabled }}
# Pull secrets are emitted only when the list is non-empty.
{% if image_pull_secrets | length > 0 %}
imagePullSecrets:
{{ image_pull_secrets | to_nice_yaml(indent=2) | indent(2, true) }}
{% endif %}

View File

@ -0,0 +1,36 @@
---
# Default variables for the Ray cluster role.

ray_namespace: "ray-system"
ray_cluster_name: "ray-cluster"
ray_version: "2.9.0"
# The image tag follows ray_version so the container image and the
# RayCluster `rayVersion` field cannot drift apart when the version is bumped.
ray_image: "rayproject/ray:{{ ray_version }}"
ray_dashboard_enabled: true

# Resources for the head pod container.
ray_head_resources:
  requests:
    cpu: "2"
    memory: "8Gi"
  limits:
    cpu: "4"
    memory: "16Gi"

# One entry per worker group. The RayCluster template reads groupName,
# replicas, minReplicas, maxReplicas, resources, and the optional
# nodeSelector, tolerations and volumeMounts keys.
ray_worker_groups:
  - groupName: gpu-workers
    replicas: 2
    minReplicas: 1
    maxReplicas: 4
    resources:
      requests:
        cpu: "4"
        memory: "32Gi"
        nvidia.com/gpu: "1"
      limits:
        cpu: "8"
        memory: "64Gi"
        nvidia.com/gpu: "1"
    nodeSelector:
      accelerator: "nvidia-h100"
    tolerations: []
    # A mount named `dshm` makes the template add a memory-backed emptyDir.
    volumeMounts:
      - mountPath: /dev/shm
        name: dshm

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1,24 @@
---
# Creates the Ray namespace, applies the RayCluster resource and waits for
# the head pod. Every task is gated to the first host of the `masters` group.

- name: Create Ray namespace
  kubernetes.core.k8s:
    name: "{{ ray_namespace }}"
    api_version: v1
    kind: Namespace
    state: present
  when: inventory_hostname == groups['masters'][0]

# This applies a RayCluster custom resource (an instance), not the CRD itself.
- name: Apply RayCluster resource
  kubernetes.core.k8s:
    state: present
    definition: "{{ lookup('template', 'raycluster.yaml.j2') | from_yaml }}"
  when: inventory_hostname == groups['masters'][0]

- name: Wait for Ray head node to be ready
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - pod
      - "--namespace={{ ray_namespace }}"
      - "--selector=ray.io/node-type=head"
      # Read the pod's Ready condition rather than its phase: a pod can be in
      # phase Running while its containers are not yet ready.
      - '--output=jsonpath={.items[0].status.conditions[?(@.type=="Ready")].status}'
  register: head_status
  # Before the pod exists kubectl exits non-zero; `until` keeps retrying.
  until: head_status.stdout == "True"
  retries: 30
  delay: 10
  changed_when: false
  when: inventory_hostname == groups['masters'][0]

View File

@ -0,0 +1,53 @@
# RayCluster manifest rendered from the role variables. Templated blocks are
# emitted with to_nice_yaml + indent(); the indent widths below must stay in
# step with the column of the key each block hangs under.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: "{{ ray_cluster_name }}"
  namespace: "{{ ray_namespace }}"
spec:
  rayVersion: '{{ ray_version }}'
  headGroupSpec:
    rayStartParams:
      dashboard-host: '0.0.0.0'
{% if not ray_dashboard_enabled %}
      # `ray start` exposes the dashboard switch as --include-dashboard.
      include-dashboard: 'false'
{% endif %}
    template:
      spec:
        containers:
        - name: ray-head
          image: "{{ ray_image }}"
          resources:
{{ ray_head_resources | to_nice_yaml(indent=4) | indent(12, true) }}
  workerGroupSpecs:
{% for group in ray_worker_groups %}
  - groupName: "{{ group.groupName }}"
    replicas: {{ group.replicas }}
    minReplicas: {{ group.minReplicas }}
    maxReplicas: {{ group.maxReplicas }}
    rayStartParams: {}
    template:
      spec:
{% if group.nodeSelector is defined %}
        nodeSelector:
{{ group.nodeSelector | to_nice_yaml(indent=2) | indent(10, true) }}
{% endif %}
{% if group.tolerations is defined and group.tolerations | length > 0 %}
        tolerations:
{{ group.tolerations | to_nice_yaml(indent=2) | indent(10, true) }}
{% endif %}
        containers:
        - name: ray-worker
          image: "{{ ray_image }}"
          resources:
{{ group.resources | to_nice_yaml(indent=4) | indent(12, true) }}
{% if group.volumeMounts is defined and group.volumeMounts | length > 0 %}
          volumeMounts:
{{ group.volumeMounts | to_nice_yaml(indent=2) | indent(10, true) }}
{% endif %}
{% if group.volumeMounts is defined and group.volumeMounts | selectattr('name', 'equalto', 'dshm') | list | length > 0 %}
        # Back the `dshm` mount with a memory-based emptyDir.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
{% endif %}
{% endfor %}

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1,31 @@
---
# Default variables for the vLLM role.

# Kubernetes objects
vllm_namespace: 'vllm-system'
vllm_service_name: 'vllm-api'
vllm_image: 'vllm/vllm-openai:v0.4.2'
vllm_port: 8000
vllm_service_type: 'ClusterIP'
vllm_ingress_enabled: false
vllm_ingress_host: 'vllm.example.com'

# Engine arguments passed on the api_server command line
vllm_model: '/models/Llama-3-70B-Instruct'
vllm_tensor_parallel_size: 2
vllm_pipeline_parallel_size: 1
vllm_gpu_memory_utilization: 0.90
vllm_max_model_len: 4096
vllm_max_num_seqs: 256

# Ray integration (exported to the container as RAY_ADDRESS)
ray_address: 'ray://ray-cluster-head-svc.ray-system.svc.cluster.local:10001'

# Container environment variables
nccl_socket_ifname: 'eth0'
gloo_socket_ifname: 'eth0'
nccl_ib_disable: '1'
vllm_logging_level: 'INFO'
torch_distributed_init_timeout: '300'
huggingface_token: ''

# Model mount: hostPath on the node -> mount path inside the container
model_host_path: '/data/models'
model_mount_path: '/models'

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1,36 @@
---
# Deploys the vLLM namespace, Deployment, Service and optional Ingress, then
# waits for the Deployment to report a ready replica. Every task is gated to
# the first host of the `masters` group.

- name: Create vLLM namespace
  kubernetes.core.k8s:
    name: "{{ vllm_namespace }}"
    api_version: v1
    kind: Namespace
    state: present
  when: inventory_hostname == groups['masters'][0]

- name: Deploy vLLM Deployment
  kubernetes.core.k8s:
    state: present
    definition: "{{ lookup('template', 'deployment.yaml.j2') | from_yaml }}"
  when: inventory_hostname == groups['masters'][0]

- name: Deploy vLLM Service
  kubernetes.core.k8s:
    state: present
    definition: "{{ lookup('template', 'service.yaml.j2') | from_yaml }}"
  when: inventory_hostname == groups['masters'][0]

- name: Deploy vLLM Ingress
  kubernetes.core.k8s:
    state: present
    definition: "{{ lookup('template', 'ingress.yaml.j2') | from_yaml }}"
  when:
    - inventory_hostname == groups['masters'][0]
    # `| bool` so a string such as "false" passed via extra-vars is honoured
    # instead of being treated as a truthy non-empty string.
    - vllm_ingress_enabled | bool

- name: Wait for vLLM API to be ready
  # No shell features are needed, so call kubectl directly with argv.
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - deploy
      - "{{ vllm_service_name }}"
      - "--namespace={{ vllm_namespace }}"
      - "--output=jsonpath={.status.readyReplicas}"
  register: vllm_ready
  # The Deployment template pins replicas to 1, hence the literal "1".
  until: vllm_ready.stdout == "1"
  retries: 40
  delay: 15
  changed_when: false
  when: inventory_hostname == groups['masters'][0]

View File

@ -0,0 +1,65 @@
# Deployment running the vLLM OpenAI-compatible API server with a single
# replica. All tunables come from the role variables.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{ vllm_service_name }}"
  namespace: "{{ vllm_namespace }}"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: "{{ vllm_service_name }}"
  template:
    metadata:
      labels:
        app: "{{ vllm_service_name }}"
    spec:
      containers:
        - name: vllm
          image: "{{ vllm_image }}"
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model={{ vllm_model }}"
            - "--tensor-parallel-size={{ vllm_tensor_parallel_size }}"
            - "--pipeline-parallel-size={{ vllm_pipeline_parallel_size }}"
            - "--gpu-memory-utilization={{ vllm_gpu_memory_utilization }}"
            - "--max-model-len={{ vllm_max_model_len }}"
            - "--max-num-seqs={{ vllm_max_num_seqs }}"
            - "--port={{ vllm_port }}"
            # --worker-use-ray alone selects the Ray executor. The duplicate
            # --distributed-executor-backend=ray flag was dropped because the
            # default v0.4.2 image is not expected to recognise it, and an
            # unknown argument aborts the server at start-up.
            # NOTE(review): confirm against the image actually deployed.
            - "--worker-use-ray"
          env:
            - name: RAY_ADDRESS
              value: "{{ ray_address }}"
            - name: NCCL_SOCKET_IFNAME
              value: "{{ nccl_socket_ifname }}"
            - name: GLOO_SOCKET_IFNAME
              value: "{{ gloo_socket_ifname }}"
            - name: NCCL_IB_DISABLE
              value: "{{ nccl_ib_disable }}"
            - name: VLLM_LOGGING_LEVEL
              value: "{{ vllm_logging_level }}"
            - name: TORCH_DISTRIBUTED_INIT_TIMEOUT
              value: "{{ torch_distributed_init_timeout }}"
            # NOTE(review): the token is rendered in plain text into the pod
            # spec; consider sourcing it from a Kubernetes Secret.
            - name: HUGGING_FACE_HUB_TOKEN
              value: "{{ huggingface_token }}"
          ports:
            - containerPort: {{ vllm_port }}
          readinessProbe:
            httpGet:
              path: /health
              port: {{ vllm_port }}
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: models
              mountPath: "{{ model_mount_path }}"
      volumes:
        # Memory-backed /dev/shm for the container.
        - name: dshm
          emptyDir:
            medium: Memory
        # Model files come from a directory on the node (created if absent).
        - name: models
          hostPath:
            path: "{{ model_host_path }}"
            type: DirectoryOrCreate

View File

@ -0,0 +1,17 @@
# Ingress routing every path on the configured host to the vLLM Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: "{{ vllm_namespace }}"
  name: "{{ vllm_service_name }}-ingress"
spec:
  rules:
    - host: "{{ vllm_ingress_host }}"
      http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: "{{ vllm_service_name }}"
                port:
                  number: {{ vllm_port }}

View File

@ -0,0 +1,51 @@
# RayService manifest: a Serve config plus the Ray cluster that hosts it.
# Expects service_name, worker_replicas, max_replicas, gpus_per_worker,
# model_name_or_path, tensor_parallel_size, vllm_image, huggingface_token and
# ray_version to be defined by the caller.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: "{{ service_name }}"
  namespace: default
spec:
  # NOTE(review): the import_path points at vLLM's engine-args class, which
  # does not look like a Ray Serve application - confirm the intended target.
  serveConfigV2: |
    applications:
      - name: vllm_app
        import_path: "vllm.engine.arg_utils:AsyncEngineArgs"
        route_prefix: /
  rayClusterConfig:
    # Use the Ray release here. kuberay_version is the operator's version
    # (1.1.0 by default) and is neither a valid rayVersion for this stack nor
    # a matching rayproject/ray image tag.
    rayVersion: "{{ ray_version }}"
    headGroupSpec:
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
            - name: ray-head
              image: "rayproject/ray:{{ ray_version }}"
    workerGroupSpecs:
      - replicas: {{ worker_replicas }}
        minReplicas: 1
        maxReplicas: {{ max_replicas }}
        groupName: gpu-group
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: vllm-node
                image: "{{ vllm_image }}"
                resources:
                  limits:
                    nvidia.com/gpu: {{ gpus_per_worker }}
                env:
                  - name: HUGGING_FACE_HUB_TOKEN
                    value: "{{ huggingface_token }}"
                command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
                args:
                  - "--model"
                  - "{{ model_name_or_path }}"
                  - "--tensor-parallel-size"
                  - "{{ tensor_parallel_size }}"
                volumeMounts:
                  - name: dshm
                    mountPath: /dev/shm
            volumes:
              # Memory-backed /dev/shm for the worker container.
              - name: dshm
                emptyDir:
                  medium: Memory

View File

@ -0,0 +1,14 @@
# Service exposing the vLLM pods (selected by their `app` label) on vllm_port.
apiVersion: v1
kind: Service
metadata:
  namespace: "{{ vllm_namespace }}"
  name: "{{ vllm_service_name }}"
spec:
  type: "{{ vllm_service_type }}"
  selector:
    app: "{{ vllm_service_name }}"
  ports:
    - name: http
      protocol: TCP
      port: {{ vllm_port }}
      targetPort: {{ vllm_port }}

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1,12 @@
---
# Kernel tuning applied to each targeted host.

# Sets vm.max_map_count and reloads sysctl so the value takes effect.
- name: Set max_map_count for vLLM/Ray
  ansible.posix.sysctl:
    name: vm.max_map_count
    value: "262144"
    reload: true
    state: present

# Ensures the ip_vs kernel module is loaded.
- name: Load ip_vs module
  community.general.modprobe:
    name: ip_vs
    state: present

View File

@ -0,0 +1,10 @@
---
# Default variables for the Sealos cluster role.

# Versions
sealos_version: '5.0.0'
kubernetes_version: 'v1.28.9'

# Cluster identity and networking
cluster_name: 'default'
pod_cidr: '10.244.0.0/16'
service_cidr: '10.96.0.0/12'

# Where the fetched admin kubeconfig is written on the control machine
kubeconfig_local_path: './admin.conf'

# Host lists derived from the inventory groups (empty when a group is absent)
masters: "{{ groups['masters'] | default([]) }}"
workers: "{{ groups['gpu_workers'] | default([]) }}"

# SSH user written into the Clusterfile
ansible_ssh_user: 'root'

View File

@ -0,0 +1 @@
---

View File

@ -0,0 +1,40 @@
---
# README:
# Prerequisites: nodes run a compatible Linux OS, can reach each other over
# the network, and have passwordless SSH login configured.
# Execution order: every task runs only on the first host of the masters
# group; `sealos apply` propagates the cluster to the remote nodes.

- name: Create sealos workspace
  ansible.builtin.file:
    state: directory
    path: /etc/sealos
    mode: '0755'
  when: inventory_hostname == masters[0]

- name: Generate Clusterfile
  ansible.builtin.template:
    dest: /etc/sealos/Clusterfile
    src: clusterfile.yaml.j2
  when: inventory_hostname == masters[0]

- name: Apply Sealos Cluster (Idempotent)
  ansible.builtin.command:
    cmd: sealos apply -f /etc/sealos/Clusterfile
  register: sealos_apply
  # Reported as changed only when the command output mentions 'Applied'.
  changed_when: "'Applied' in sealos_apply.stdout"
  failed_when: sealos_apply.rc != 0
  when: inventory_hostname == masters[0]

# Counts node lines that do not contain 'NotReady' and polls until that count
# equals the number of masters plus workers.
# NOTE(review): nodes normally stay NotReady until a CNI is running, and the
# playbook lists the cni_cilium role after this one - confirm this wait can
# succeed at this point.
- name: Wait for nodes to be ready
  ansible.builtin.shell:
    cmd: "kubectl get nodes --no-headers | grep -v 'NotReady' | wc -l"
  register: ready_nodes
  until: (ready_nodes.stdout | int) == ((masters | length) + (workers | length))
  delay: 10
  retries: 30
  changed_when: false
  when: inventory_hostname == masters[0]

- name: Fetch kubeconfig to local
  ansible.builtin.fetch:
    src: /etc/kubernetes/admin.conf
    dest: "{{ kubeconfig_local_path }}"
    flat: true
  when: inventory_hostname == masters[0]

View File

@ -0,0 +1,15 @@
# Sealos Clusterfile rendered from the inventory groups.
# Sealos reads `Cluster` objects from the apps.sealos.io/v1beta1 API and
# needs `spec.image` to know which cluster image to install.
apiVersion: apps.sealos.io/v1beta1
kind: Cluster
metadata:
  name: "{{ cluster_name }}"
spec:
  hosts:
    - ips: {{ groups['masters'] | map('extract', hostvars, ['ansible_host']) | list | to_json }}
      roles: [master]
    - ips: {{ groups['gpu_workers'] | map('extract', hostvars, ['ansible_host']) | list | to_json }}
      roles: [node]
  image:
    # NOTE(review): confirm the registry path and tag for your environment;
    # no CNI image is listed because the playbook has a separate CNI role.
    - "labring/kubernetes:{{ kubernetes_version }}"
  ssh:
    user: "{{ ansible_user | default('root') }}"
---
# The Cluster spec has no network section; pod and service CIDRs are passed
# to kubeadm through an appended ClusterConfiguration document.
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
networking:
  podSubnet: "{{ pod_cidr }}"
  serviceSubnet: "{{ service_cidr }}"

View File

@ -0,0 +1,15 @@
# Sealos Clusterfile rendered from the role's `masters` / `workers` lists.
# Sealos reads `Cluster` objects from the apps.sealos.io/v1beta1 API and
# needs `spec.image` to know which cluster image to install.
apiVersion: apps.sealos.io/v1beta1
kind: Cluster
metadata:
  name: "{{ cluster_name }}"
spec:
  hosts:
    - ips: {{ masters | map('extract', hostvars, ['ansible_host']) | list | to_json }}
      roles: [master]
    - ips: {{ workers | map('extract', hostvars, ['ansible_host']) | list | to_json }}
      roles: [node]
  image:
    # NOTE(review): confirm the registry path and tag for your environment;
    # no CNI image is listed because the playbook has a separate CNI role.
    - "labring/kubernetes:{{ kubernetes_version }}"
  ssh:
    user: "{{ ansible_ssh_user }}"
---
# The Cluster spec has no network section; pod and service CIDRs are passed
# to kubeadm through an appended ClusterConfiguration document.
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
networking:
  podSubnet: "{{ pod_cidr }}"
  serviceSubnet: "{{ service_cidr }}"

View File

@ -0,0 +1 @@
---