From 515ba95c75d3e23b2ded9e8d6dff649e318ecda6 Mon Sep 17 00:00:00 2001
From: Haitao Pan
Date: Thu, 23 Apr 2026 19:17:23 +0800
Subject: [PATCH] feat(gpu_inference): add comprehensive GPU inference infrastructure with Sealos, Ray, and vLLM

---
Review notes (not part of the commit message):

* Purpose: adds staged playbooks (prepare, Sealos, GPU Operator, Ray, vLLM),
  a site playbook importing them in order, an example inventory, and the
  roles/templates they use. Several role task files contain only `---`.
* This text was re-flowed from a copy whose whitespace had been collapsed.
  Indentation inside the hunks is reconstructed (checked against the hunk
  line counts and the indent() arguments in the templates); verify it
  before applying.
* The `index` blob ids are the original ones and will not match the files
  touched by the review edits listed below.
* Review edits (line counts unchanged): comments translated to English;
  `true` instead of `yes`; explicit boolean rendering in values.yaml.j2;
  ray_image and gpu_operator_chart_version derived from ray_version and
  gpu_operator_version; `vllm_ingress_enabled | bool`;
  rayservice_vllm.yaml.j2 uses ray_version (not kuberay_version) for the
  Ray version and image, and falls back to the role's vllm_* defaults for
  variables that are not defined anywhere in this patch.
* NOTE(review): roles/vhosts/common is referenced but not added here, and
  no role in this patch installs the KubeRay operator; confirm both exist
  elsewhere.
* NOTE(review): the Clusterfile templates use `cluster.sealos.io/v1` and no
  image list; confirm this schema against the Sealos version in use.
* NOTE(review): huggingface_token is rendered as a plain env value in the
  manifests; consider a Kubernetes Secret reference.

 gpu_inference_01_prepare.yml                  |  8 +++
 gpu_inference_02_sealos.yml                   |  7 ++
 gpu_inference_03_gpu_operator.yml             |  6 ++
 gpu_inference_04_ray.yml                      |  7 ++
 gpu_inference_05_vllm.yml                     |  7 ++
 gpu_inference_site.yml                        |  6 ++
 inventory/group_vars/all.yml                  | 27 ++++++++
 inventory/group_vars/gpu_workers.yml          |  1 +
 inventory/group_vars/masters.yml              |  1 +
 inventory/group_vars/ray_workers.yml          |  1 +
 inventory/hosts.ini                           | 13 ++++
 .../nvidia_gpu_operator/defaults/main.yml     | 15 +++++
 .../nvidia_gpu_operator/handlers/main.yml     |  1 +
 .../charts/nvidia_gpu_operator/tasks/main.yml | 28 ++++++++
 .../nvidia_gpu_operator/tasks/validate.yml    | 15 +++++
 .../templates/values.yaml.j2                  | 15 +++++
 roles/charts/ray_cluster/defaults/main.yml    | 36 ++++++++++
 roles/charts/ray_cluster/handlers/main.yml    |  1 +
 roles/charts/ray_cluster/tasks/main.yml       | 24 +++++++
 .../ray_cluster/templates/raycluster.yaml.j2  | 53 +++++++++++++++
 roles/charts/ray_service/tasks/main.yml       |  1 +
 roles/charts/vllm_runtime/tasks/main.yml      |  1 +
 roles/charts/vllm_service/defaults/main.yml   | 31 +++++++++
 roles/charts/vllm_service/handlers/main.yml   |  1 +
 roles/charts/vllm_service/tasks/main.yml      | 36 ++++++++++
 .../vllm_service/templates/deployment.yaml.j2 | 65 +++++++++++++++++++
 .../vllm_service/templates/ingress.yaml.j2    | 17 +++++
 .../templates/rayservice_vllm.yaml.j2         | 51 +++++++++++++++
 .../vllm_service/templates/service.yaml.j2    | 14 ++++
 roles/docker/container_runtime/tasks/main.yml |  1 +
 roles/vhosts/cni_cilium/tasks/main.yml        |  1 +
 roles/vhosts/kernel_tuning/tasks/main.yml     | 12 ++++
 roles/vhosts/sealos_cluster/defaults/main.yml | 10 +++
 roles/vhosts/sealos_cluster/handlers/main.yml |  1 +
 roles/vhosts/sealos_cluster/tasks/main.yml    | 40 ++++++++++++
 .../sealos_cluster/templates/Clusterfile.j2   | 15 +++++
 .../templates/clusterfile.yaml.j2             | 15 +++++
 roles/vhosts/validation/tasks/main.yml        |  1 +
 38 files changed, 585 insertions(+)
 create mode 100644 gpu_inference_01_prepare.yml
 create mode 100644 gpu_inference_02_sealos.yml
 create mode 100644 gpu_inference_03_gpu_operator.yml
 create mode 100644 gpu_inference_04_ray.yml
 create mode 100644 gpu_inference_05_vllm.yml
 create mode 100644 gpu_inference_site.yml
 create mode 100644 inventory/group_vars/all.yml
 create mode 100644 inventory/group_vars/gpu_workers.yml
 create mode 100644 inventory/group_vars/masters.yml
 create mode 100644 inventory/group_vars/ray_workers.yml
 create mode 100644 inventory/hosts.ini
 create mode 100644 roles/charts/nvidia_gpu_operator/defaults/main.yml
 create mode 100644 roles/charts/nvidia_gpu_operator/handlers/main.yml
 create mode 100644 roles/charts/nvidia_gpu_operator/tasks/main.yml
 create mode 100644 roles/charts/nvidia_gpu_operator/tasks/validate.yml
 create mode 100644 roles/charts/nvidia_gpu_operator/templates/values.yaml.j2
 create mode 100644 roles/charts/ray_cluster/defaults/main.yml
 create mode 100644 roles/charts/ray_cluster/handlers/main.yml
 create mode 100644 roles/charts/ray_cluster/tasks/main.yml
 create mode 100644 roles/charts/ray_cluster/templates/raycluster.yaml.j2
 create mode 100644 roles/charts/ray_service/tasks/main.yml
 create mode 100644 roles/charts/vllm_runtime/tasks/main.yml
 create mode 100644 roles/charts/vllm_service/defaults/main.yml
 create mode 100644 roles/charts/vllm_service/handlers/main.yml
 create mode 100644 roles/charts/vllm_service/tasks/main.yml
 create mode 100644 roles/charts/vllm_service/templates/deployment.yaml.j2
 create mode 100644 roles/charts/vllm_service/templates/ingress.yaml.j2
 create mode 100644 roles/charts/vllm_service/templates/rayservice_vllm.yaml.j2
 create mode 100644 roles/charts/vllm_service/templates/service.yaml.j2
 create mode 100644 roles/docker/container_runtime/tasks/main.yml
 create mode 100644 roles/vhosts/cni_cilium/tasks/main.yml
 create mode 100644 roles/vhosts/kernel_tuning/tasks/main.yml
 create mode 100644 roles/vhosts/sealos_cluster/defaults/main.yml
 create mode 100644 roles/vhosts/sealos_cluster/handlers/main.yml
 create mode 100644 roles/vhosts/sealos_cluster/tasks/main.yml
 create mode 100644 roles/vhosts/sealos_cluster/templates/Clusterfile.j2
 create mode 100644 roles/vhosts/sealos_cluster/templates/clusterfile.yaml.j2
 create mode 100644 roles/vhosts/validation/tasks/main.yml

diff --git a/gpu_inference_01_prepare.yml b/gpu_inference_01_prepare.yml
new file mode 100644
index 0000000..a0d7a26
--- /dev/null
+++ b/gpu_inference_01_prepare.yml
@@ -0,0 +1,8 @@
+---
+- name: Prepare Host Environment
+  hosts: all
+  become: true
+  roles:
+    - roles/vhosts/common
+    - roles/vhosts/kernel_tuning
+    - roles/docker/container_runtime
diff --git a/gpu_inference_02_sealos.yml b/gpu_inference_02_sealos.yml
new file mode 100644
index 0000000..2809712
--- /dev/null
+++ b/gpu_inference_02_sealos.yml
@@ -0,0 +1,7 @@
+---
+- name: Install Kubernetes via Sealos
+  hosts: masters
+  become: true
+  roles:
+    - roles/vhosts/sealos_cluster
+    - roles/vhosts/cni_cilium
diff --git a/gpu_inference_03_gpu_operator.yml b/gpu_inference_03_gpu_operator.yml
new file mode 100644
index 0000000..c2dd683
--- /dev/null
+++ b/gpu_inference_03_gpu_operator.yml
@@ -0,0 +1,6 @@
+---
+- name: Install NVIDIA GPU Operator
+  hosts: masters[0]
+  become: true
+  roles:
+    - roles/charts/nvidia_gpu_operator
diff --git a/gpu_inference_04_ray.yml b/gpu_inference_04_ray.yml
new file mode 100644
index 0000000..5d47b80
--- /dev/null
+++ b/gpu_inference_04_ray.yml
@@ -0,0 +1,7 @@
+---
+- name: Deploy Ray Cluster
+  hosts: masters[0]
+  become: true
+  roles:
+    - roles/charts/ray_cluster
+    - roles/charts/ray_service
diff --git a/gpu_inference_05_vllm.yml b/gpu_inference_05_vllm.yml
new file mode 100644
index 0000000..6fc9be6
--- /dev/null
+++ b/gpu_inference_05_vllm.yml
@@ -0,0 +1,7 @@
+---
+- name: Deploy vLLM Inference Service
+  hosts: masters[0]
+  become: true
+  roles:
+    - roles/charts/vllm_runtime
+    - roles/charts/vllm_service
diff --git a/gpu_inference_site.yml b/gpu_inference_site.yml
new file mode 100644
index 0000000..988adee
--- /dev/null
+++ b/gpu_inference_site.yml
@@ -0,0 +1,6 @@
+---
+- import_playbook: gpu_inference_01_prepare.yml
+- import_playbook: gpu_inference_02_sealos.yml
+- import_playbook: gpu_inference_03_gpu_operator.yml
+- import_playbook: gpu_inference_04_ray.yml
+- import_playbook: gpu_inference_05_vllm.yml
diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml
new file mode 100644
index 0000000..8226845
--- /dev/null
+++ b/inventory/group_vars/all.yml
@@ -0,0 +1,27 @@
+---
+# Global versions and images
+kubernetes_version: "v1.28.9"
+sealos_version: "5.0.0"
+cilium_version: "1.15.5"
+gpu_operator_version: "v24.3.0"
+kuberay_version: "1.1.0"
+ray_version: "2.9.0"
+vllm_image: "vllm/vllm-openai:v0.4.2"
+
+# Network configuration
+pod_cidr: "10.244.0.0/16"
+service_cidr: "10.96.0.0/12"
+nccl_socket_ifname: "eth0"
+gloo_socket_ifname: "eth0"
+
+# Model and inference configuration
+vllm_model: "/models/Llama-3-70B-Instruct"
+vllm_tensor_parallel_size: 2
+vllm_pipeline_parallel_size: 1
+
+# GPU driver policy
+driver_enabled: true
+driver_version: "535.129.03"
+dcgm_exporter_enabled: true
+
+ansible_user: "root"
diff --git a/inventory/group_vars/gpu_workers.yml b/inventory/group_vars/gpu_workers.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/inventory/group_vars/gpu_workers.yml
@@ -0,0 +1 @@
+---
diff --git a/inventory/group_vars/masters.yml b/inventory/group_vars/masters.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/inventory/group_vars/masters.yml
@@ -0,0 +1 @@
+---
diff --git a/inventory/group_vars/ray_workers.yml b/inventory/group_vars/ray_workers.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/inventory/group_vars/ray_workers.yml
@@ -0,0 +1 @@
+---
diff --git a/inventory/hosts.ini b/inventory/hosts.ini
new file mode 100644
index 0000000..ec7a830
--- /dev/null
+++ b/inventory/hosts.ini
@@ -0,0 +1,13 @@
+[masters]
+k8s-master-01 ansible_host=10.0.0.10
+
+[gpu_workers]
+k8s-gpu-01 ansible_host=10.0.0.21 accelerator=nvidia-h100
+k8s-gpu-02 ansible_host=10.0.0.22 accelerator=nvidia-h100
+
+[ray_workers:children]
+gpu_workers
+
+[k8s_cluster:children]
+masters
+gpu_workers
diff --git a/roles/charts/nvidia_gpu_operator/defaults/main.yml b/roles/charts/nvidia_gpu_operator/defaults/main.yml
new file mode 100644
index 0000000..4ee4b3f
--- /dev/null
+++ b/roles/charts/nvidia_gpu_operator/defaults/main.yml
@@ -0,0 +1,15 @@
+---
+gpu_operator_namespace: "gpu-operator"
+gpu_operator_release_name: "gpu-operator"
+gpu_operator_chart_version: "{{ gpu_operator_version | default('v24.3.0') }}"
+
+# Air-gapped / Private registry support
+gpu_operator_repository: "https://helm.ngc.nvidia.com/nvidia"
+image_pull_secrets: []
+
+# Operator settings
+driver_enabled: true
+driver_version: "535.129.03"
+toolkit_enabled: true
+mig_strategy: "single"  # none, single, mixed
+dcgm_exporter_enabled: true
diff --git a/roles/charts/nvidia_gpu_operator/handlers/main.yml b/roles/charts/nvidia_gpu_operator/handlers/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/charts/nvidia_gpu_operator/handlers/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/charts/nvidia_gpu_operator/tasks/main.yml b/roles/charts/nvidia_gpu_operator/tasks/main.yml
new file mode 100644
index 0000000..0807fa9
--- /dev/null
+++ b/roles/charts/nvidia_gpu_operator/tasks/main.yml
@@ -0,0 +1,28 @@
+---
+- name: Create GPU Operator namespace
+  kubernetes.core.k8s:
+    api_version: v1
+    kind: Namespace
+    name: "{{ gpu_operator_namespace }}"
+    state: present
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Add NVIDIA helm repo
+  kubernetes.core.helm_repository:
+    name: nvidia
+    repo_url: "{{ gpu_operator_repository }}"
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Deploy GPU Operator
+  kubernetes.core.helm:
+    name: "{{ gpu_operator_release_name }}"
+    chart_ref: nvidia/gpu-operator
+    release_namespace: "{{ gpu_operator_namespace }}"
+    version: "{{ gpu_operator_chart_version }}"
+    values: "{{ lookup('template', 'values.yaml.j2') | from_yaml }}"
+    wait: true
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Include validation tasks
+  include_tasks: validate.yml
+  when: inventory_hostname == groups['masters'][0]
diff --git a/roles/charts/nvidia_gpu_operator/tasks/validate.yml b/roles/charts/nvidia_gpu_operator/tasks/validate.yml
new file mode 100644
index 0000000..f1a1036
--- /dev/null
+++ b/roles/charts/nvidia_gpu_operator/tasks/validate.yml
@@ -0,0 +1,15 @@
+---
+- name: Wait for NVIDIA Device Plugin daemonset to be ready
+  shell: |
+    kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n {{ gpu_operator_namespace }} --timeout=300s
+  register: ds_status
+  changed_when: false
+
+- name: Validate GPU resources are allocatable
+  shell: |
+    kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[*].status.allocatable}'
+  register: gpu_allocatable
+  until: "'nvidia.com/gpu' in gpu_allocatable.stdout"
+  retries: 30
+  delay: 20
+  changed_when: false
diff --git a/roles/charts/nvidia_gpu_operator/templates/values.yaml.j2 b/roles/charts/nvidia_gpu_operator/templates/values.yaml.j2
new file mode 100644
index 0000000..0b98ea0
--- /dev/null
+++ b/roles/charts/nvidia_gpu_operator/templates/values.yaml.j2
@@ -0,0 +1,15 @@
+driver:
+  enabled: {{ driver_enabled | bool | lower }}  # render canonical true/false
+  version: "{{ driver_version }}"
+toolkit:
+  enabled: {{ toolkit_enabled | bool | lower }}
+devicePlugin:
+  enabled: true
+mig:
+  strategy: "{{ mig_strategy }}"
+dcgmExporter:
+  enabled: {{ dcgm_exporter_enabled | bool | lower }}
+{% if image_pull_secrets | length > 0 %}
+imagePullSecrets:
+{{ image_pull_secrets | to_nice_yaml(indent=2) | indent(2, true) }}
+{% endif %}
diff --git a/roles/charts/ray_cluster/defaults/main.yml b/roles/charts/ray_cluster/defaults/main.yml
new file mode 100644
index 0000000..7b581d8
--- /dev/null
+++ b/roles/charts/ray_cluster/defaults/main.yml
@@ -0,0 +1,36 @@
+---
+ray_namespace: "ray-system"
+ray_cluster_name: "ray-cluster"
+ray_image: "rayproject/ray:{{ ray_version }}"
+ray_version: "2.9.0"
+
+ray_dashboard_enabled: true
+
+ray_head_resources:
+  requests:
+    cpu: "2"
+    memory: "8Gi"
+  limits:
+    cpu: "4"
+    memory: "16Gi"
+
+ray_worker_groups:
+  - groupName: gpu-workers
+    replicas: 2
+    minReplicas: 1
+    maxReplicas: 4
+    resources:
+      requests:
+        cpu: "4"
+        memory: "32Gi"
+        nvidia.com/gpu: "1"
+      limits:
+        cpu: "8"
+        memory: "64Gi"
+        nvidia.com/gpu: "1"
+    nodeSelector:
+      accelerator: "nvidia-h100"
+    tolerations: []
+    volumeMounts:
+      - mountPath: /dev/shm
+        name: dshm
diff --git a/roles/charts/ray_cluster/handlers/main.yml b/roles/charts/ray_cluster/handlers/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/charts/ray_cluster/handlers/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/charts/ray_cluster/tasks/main.yml b/roles/charts/ray_cluster/tasks/main.yml
new file mode 100644
index 0000000..20ff5fc
--- /dev/null
+++ b/roles/charts/ray_cluster/tasks/main.yml
@@ -0,0 +1,24 @@
+---
+- name: Create Ray namespace
+  kubernetes.core.k8s:
+    name: "{{ ray_namespace }}"
+    api_version: v1
+    kind: Namespace
+    state: present
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Apply RayCluster CRD
+  kubernetes.core.k8s:
+    state: present
+    definition: "{{ lookup('template', 'raycluster.yaml.j2') | from_yaml }}"
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Wait for Ray head node to be ready
+  shell: |
+    kubectl get pod -n {{ ray_namespace }} -l ray.io/node-type=head -o jsonpath='{.items[0].status.phase}'
+  register: head_status
+  until: head_status.stdout == "Running"
+  retries: 30
+  delay: 10
+  changed_when: false
+  when: inventory_hostname == groups['masters'][0]
diff --git a/roles/charts/ray_cluster/templates/raycluster.yaml.j2 b/roles/charts/ray_cluster/templates/raycluster.yaml.j2
new file mode 100644
index 0000000..83566af
--- /dev/null
+++ b/roles/charts/ray_cluster/templates/raycluster.yaml.j2
@@ -0,0 +1,53 @@
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+  name: {{ ray_cluster_name }}
+  namespace: {{ ray_namespace }}
+spec:
+  rayVersion: '{{ ray_version }}'
+  headGroupSpec:
+    rayStartParams:
+      dashboard-host: '0.0.0.0'
+{% if not ray_dashboard_enabled %}
+      dashboard-enabled: 'false'
+{% endif %}
+    template:
+      spec:
+        containers:
+        - name: ray-head
+          image: {{ ray_image }}
+          resources:
+{{ ray_head_resources | to_nice_yaml(indent=4) | indent(12, true) }}
+  workerGroupSpecs:
+{% for group in ray_worker_groups %}
+  - groupName: {{ group.groupName }}
+    replicas: {{ group.replicas }}
+    minReplicas: {{ group.minReplicas }}
+    maxReplicas: {{ group.maxReplicas }}
+    rayStartParams: {}
+    template:
+      spec:
+{% if group.nodeSelector is defined %}
+        nodeSelector:
+{{ group.nodeSelector | to_nice_yaml(indent=2) | indent(10, true) }}
+{% endif %}
+{% if group.tolerations is defined and group.tolerations | length > 0 %}
+        tolerations:
+{{ group.tolerations | to_nice_yaml(indent=2) | indent(10, true) }}
+{% endif %}
+        containers:
+        - name: ray-worker
+          image: {{ ray_image }}
+          resources:
+{{ group.resources | to_nice_yaml(indent=4) | indent(12, true) }}
+{% if group.volumeMounts is defined and group.volumeMounts | length > 0 %}
+          volumeMounts:
+{{ group.volumeMounts | to_nice_yaml(indent=2) | indent(10, true) }}
+{% endif %}
+{% if group.volumeMounts is defined and group.volumeMounts | selectattr('name', 'equalto', 'dshm') | list | length > 0 %}
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+{% endif %}
+{% endfor %}
diff --git a/roles/charts/ray_service/tasks/main.yml b/roles/charts/ray_service/tasks/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/charts/ray_service/tasks/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/charts/vllm_runtime/tasks/main.yml b/roles/charts/vllm_runtime/tasks/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/charts/vllm_runtime/tasks/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/charts/vllm_service/defaults/main.yml b/roles/charts/vllm_service/defaults/main.yml
new file mode 100644
index 0000000..8771aae
--- /dev/null
+++ b/roles/charts/vllm_service/defaults/main.yml
@@ -0,0 +1,31 @@
+---
+vllm_namespace: "vllm-system"
+vllm_service_name: "vllm-api"
+vllm_image: "vllm/vllm-openai:v0.4.2"
+vllm_model: "/models/Llama-3-70B-Instruct"
+
+vllm_tensor_parallel_size: 2
+vllm_pipeline_parallel_size: 1
+vllm_gpu_memory_utilization: 0.90
+vllm_max_model_len: 4096
+vllm_max_num_seqs: 256
+
+vllm_port: 8000
+vllm_service_type: "ClusterIP"
+vllm_ingress_enabled: false
+vllm_ingress_host: "vllm.example.com"
+
+# Ray Integration
+ray_address: "ray://ray-cluster-head-svc.ray-system.svc.cluster.local:10001"
+
+# Environment Variables
+nccl_socket_ifname: "eth0"
+gloo_socket_ifname: "eth0"
+nccl_ib_disable: "1"
+vllm_logging_level: "INFO"
+torch_distributed_init_timeout: "300"
+huggingface_token: ""
+
+# Model Mount
+model_host_path: "/data/models"
+model_mount_path: "/models"
diff --git a/roles/charts/vllm_service/handlers/main.yml b/roles/charts/vllm_service/handlers/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/charts/vllm_service/handlers/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/charts/vllm_service/tasks/main.yml b/roles/charts/vllm_service/tasks/main.yml
new file mode 100644
index 0000000..4e7c0c3
--- /dev/null
+++ b/roles/charts/vllm_service/tasks/main.yml
@@ -0,0 +1,36 @@
+---
+- name: Create vLLM namespace
+  kubernetes.core.k8s:
+    name: "{{ vllm_namespace }}"
+    api_version: v1
+    kind: Namespace
+    state: present
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Deploy vLLM Deployment
+  kubernetes.core.k8s:
+    state: present
+    definition: "{{ lookup('template', 'deployment.yaml.j2') | from_yaml }}"
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Deploy vLLM Service
+  kubernetes.core.k8s:
+    state: present
+    definition: "{{ lookup('template', 'service.yaml.j2') | from_yaml }}"
+  when: inventory_hostname == groups['masters'][0]
+
+- name: Deploy vLLM Ingress
+  kubernetes.core.k8s:
+    state: present
+    definition: "{{ lookup('template', 'ingress.yaml.j2') | from_yaml }}"
+  when: inventory_hostname == groups['masters'][0] and vllm_ingress_enabled | bool
+
+- name: Wait for vLLM API to be ready
+  shell: |
+    kubectl get deploy {{ vllm_service_name }} -n {{ vllm_namespace }} -o jsonpath='{.status.readyReplicas}'
+  register: vllm_ready
+  until: vllm_ready.stdout == "1"
+  retries: 40
+  delay: 15
+  changed_when: false
+  when: inventory_hostname == groups['masters'][0]
diff --git a/roles/charts/vllm_service/templates/deployment.yaml.j2 b/roles/charts/vllm_service/templates/deployment.yaml.j2
new file mode 100644
index 0000000..1e55dcf
--- /dev/null
+++ b/roles/charts/vllm_service/templates/deployment.yaml.j2
@@ -0,0 +1,65 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ vllm_service_name }}
+  namespace: {{ vllm_namespace }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: {{ vllm_service_name }}
+  template:
+    metadata:
+      labels:
+        app: {{ vllm_service_name }}
+    spec:
+      containers:
+        - name: vllm
+          image: {{ vllm_image }}
+          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+          args:
+            - "--model={{ vllm_model }}"
+            - "--tensor-parallel-size={{ vllm_tensor_parallel_size }}"
+            - "--pipeline-parallel-size={{ vllm_pipeline_parallel_size }}"
+            - "--gpu-memory-utilization={{ vllm_gpu_memory_utilization }}"
+            - "--max-model-len={{ vllm_max_model_len }}"
+            - "--max-num-seqs={{ vllm_max_num_seqs }}"
+            - "--port={{ vllm_port }}"
+            - "--distributed-executor-backend=ray"
+            - "--worker-use-ray"
+          env:
+            - name: RAY_ADDRESS
+              value: "{{ ray_address }}"
+            - name: NCCL_SOCKET_IFNAME
+              value: "{{ nccl_socket_ifname }}"
+            - name: GLOO_SOCKET_IFNAME
+              value: "{{ gloo_socket_ifname }}"
+            - name: NCCL_IB_DISABLE
+              value: "{{ nccl_ib_disable }}"
+            - name: VLLM_LOGGING_LEVEL
+              value: "{{ vllm_logging_level }}"
+            - name: TORCH_DISTRIBUTED_INIT_TIMEOUT
+              value: "{{ torch_distributed_init_timeout }}"
+            - name: HUGGING_FACE_HUB_TOKEN
+              value: "{{ huggingface_token }}"
+          ports:
+            - containerPort: {{ vllm_port }}
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: {{ vllm_port }}
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+            - name: models
+              mountPath: {{ model_mount_path }}
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        - name: models
+          hostPath:
+            path: {{ model_host_path }}
+            type: DirectoryOrCreate
diff --git a/roles/charts/vllm_service/templates/ingress.yaml.j2 b/roles/charts/vllm_service/templates/ingress.yaml.j2
new file mode 100644
index 0000000..66d837d
--- /dev/null
+++ b/roles/charts/vllm_service/templates/ingress.yaml.j2
@@ -0,0 +1,17 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ vllm_service_name }}-ingress
+  namespace: {{ vllm_namespace }}
+spec:
+  rules:
+    - host: {{ vllm_ingress_host }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ vllm_service_name }}
+                port:
+                  number: {{ vllm_port }}
diff --git a/roles/charts/vllm_service/templates/rayservice_vllm.yaml.j2 b/roles/charts/vllm_service/templates/rayservice_vllm.yaml.j2
new file mode 100644
index 0000000..a98f630
--- /dev/null
+++ b/roles/charts/vllm_service/templates/rayservice_vllm.yaml.j2
@@ -0,0 +1,51 @@
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: {{ service_name | default(vllm_service_name) }}
+  namespace: default
+spec:
+  serveConfigV2: |
+    applications:
+      - name: vllm_app
+        import_path: "vllm.engine.arg_utils:AsyncEngineArgs"
+        route_prefix: /
+  rayClusterConfig:
+    rayVersion: '{{ ray_version }}'  # Ray release, not the KubeRay operator version
+    headGroupSpec:
+      rayStartParams:
+        dashboard-host: '0.0.0.0'
+      template:
+        spec:
+          containers:
+            - name: ray-head
+              image: rayproject/ray:{{ ray_version }}
+    workerGroupSpecs:
+      - replicas: {{ worker_replicas | default(1) }}
+        minReplicas: 1
+        maxReplicas: {{ max_replicas | default(worker_replicas | default(1)) }}
+        groupName: gpu-group
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+              - name: vllm-node
+                image: {{ vllm_image }}
+                resources:
+                  limits:
+                    nvidia.com/gpu: {{ gpus_per_worker | default(1) }}
+                env:
+                  - name: HUGGING_FACE_HUB_TOKEN
+                    value: "{{ huggingface_token }}"
+                command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+                args:
+                  - "--model"
+                  - "{{ model_name_or_path | default(vllm_model) }}"
+                  - "--tensor-parallel-size"
+                  - "{{ tensor_parallel_size | default(vllm_tensor_parallel_size) }}"
+                volumeMounts:
+                  - name: dshm
+                    mountPath: /dev/shm
+            volumes:
+              - name: dshm
+                emptyDir:
+                  medium: Memory
diff --git a/roles/charts/vllm_service/templates/service.yaml.j2 b/roles/charts/vllm_service/templates/service.yaml.j2
new file mode 100644
index 0000000..4c3bb68
--- /dev/null
+++ b/roles/charts/vllm_service/templates/service.yaml.j2
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ vllm_service_name }}
+  namespace: {{ vllm_namespace }}
+spec:
+  type: {{ vllm_service_type }}
+  ports:
+    - port: {{ vllm_port }}
+      targetPort: {{ vllm_port }}
+      protocol: TCP
+      name: http
+  selector:
+    app: {{ vllm_service_name }}
diff --git a/roles/docker/container_runtime/tasks/main.yml b/roles/docker/container_runtime/tasks/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/docker/container_runtime/tasks/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/vhosts/cni_cilium/tasks/main.yml b/roles/vhosts/cni_cilium/tasks/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/vhosts/cni_cilium/tasks/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/vhosts/kernel_tuning/tasks/main.yml b/roles/vhosts/kernel_tuning/tasks/main.yml
new file mode 100644
index 0000000..0e8ae1c
--- /dev/null
+++ b/roles/vhosts/kernel_tuning/tasks/main.yml
@@ -0,0 +1,12 @@
+---
+- name: Set max_map_count for vLLM/Ray
+  sysctl:
+    name: vm.max_map_count
+    value: '262144'
+    state: present
+    reload: true
+
+- name: Load ip_vs module
+  modprobe:
+    name: ip_vs
+    state: present
diff --git a/roles/vhosts/sealos_cluster/defaults/main.yml b/roles/vhosts/sealos_cluster/defaults/main.yml
new file mode 100644
index 0000000..e569e01
--- /dev/null
+++ b/roles/vhosts/sealos_cluster/defaults/main.yml
@@ -0,0 +1,10 @@
+---
+sealos_version: "5.0.0"
+kubernetes_version: "v1.28.9"
+pod_cidr: "10.244.0.0/16"
+service_cidr: "10.96.0.0/12"
+kubeconfig_local_path: "./admin.conf"
+cluster_name: "default"
+masters: "{{ groups['masters'] | default([]) }}"
+workers: "{{ groups['gpu_workers'] | default([]) }}"
+ansible_ssh_user: "root"
diff --git a/roles/vhosts/sealos_cluster/handlers/main.yml b/roles/vhosts/sealos_cluster/handlers/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/vhosts/sealos_cluster/handlers/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/vhosts/sealos_cluster/tasks/main.yml b/roles/vhosts/sealos_cluster/tasks/main.yml
new file mode 100644
index 0000000..c432112
--- /dev/null
+++ b/roles/vhosts/sealos_cluster/tasks/main.yml
@@ -0,0 +1,40 @@
+---
+# README:
+# Prerequisites: nodes run a compatible Linux OS, can reach each other over the network, and have passwordless login configured.
+# Execution order: runs only on the first node of the masters group; sealos apply syncs to the remote nodes automatically.
+
+- name: Create sealos workspace
+  file:
+    path: /etc/sealos
+    state: directory
+    mode: '0755'
+  when: inventory_hostname == masters[0]
+
+- name: Generate Clusterfile
+  template:
+    src: clusterfile.yaml.j2
+    dest: /etc/sealos/Clusterfile
+  when: inventory_hostname == masters[0]
+
+- name: Apply Sealos Cluster (Idempotent)
+  command: "sealos apply -f /etc/sealos/Clusterfile"
+  register: sealos_apply
+  changed_when: "'Applied' in sealos_apply.stdout"
+  failed_when: sealos_apply.rc != 0
+  when: inventory_hostname == masters[0]
+
+- name: Wait for nodes to be ready
+  shell: "kubectl get nodes --no-headers | grep -v 'NotReady' | wc -l"
+  register: ready_nodes
+  until: ready_nodes.stdout | int == (masters | length + workers | length)
+  retries: 30
+  delay: 10
+  changed_when: false
+  when: inventory_hostname == masters[0]
+
+- name: Fetch kubeconfig to local
+  fetch:
+    src: /etc/kubernetes/admin.conf
+    dest: "{{ kubeconfig_local_path }}"
+    flat: true
+  when: inventory_hostname == masters[0]
diff --git a/roles/vhosts/sealos_cluster/templates/Clusterfile.j2 b/roles/vhosts/sealos_cluster/templates/Clusterfile.j2
new file mode 100644
index 0000000..69f9f04
--- /dev/null
+++ b/roles/vhosts/sealos_cluster/templates/Clusterfile.j2
@@ -0,0 +1,15 @@
+apiVersion: cluster.sealos.io/v1
+kind: Cluster
+metadata:
+  name: {{ cluster_name }}
+spec:
+  hosts:
+    - ips: {{ groups['masters'] | map('extract', hostvars, ['ansible_host']) | list | to_json }}
+      roles: [master]
+    - ips: {{ groups['gpu_workers'] | map('extract', hostvars, ['ansible_host']) | list | to_json }}
+      roles: [node]
+  ssh:
+    user: {{ ansible_user | default('root') }}
+  network:
+    podCIDR: {{ pod_cidr }}
+    svcCIDR: {{ service_cidr }}
diff --git a/roles/vhosts/sealos_cluster/templates/clusterfile.yaml.j2 b/roles/vhosts/sealos_cluster/templates/clusterfile.yaml.j2
new file mode 100644
index 0000000..0a96c65
--- /dev/null
+++ b/roles/vhosts/sealos_cluster/templates/clusterfile.yaml.j2
@@ -0,0 +1,15 @@
+apiVersion: cluster.sealos.io/v1
+kind: Cluster
+metadata:
+  name: {{ cluster_name }}
+spec:
+  hosts:
+    - ips: {{ masters | map('extract', hostvars, ['ansible_host']) | list | to_json }}
+      roles: [master]
+    - ips: {{ workers | map('extract', hostvars, ['ansible_host']) | list | to_json }}
+      roles: [node]
+  ssh:
+    user: {{ ansible_ssh_user }}
+  network:
+    podCIDR: {{ pod_cidr }}
+    svcCIDR: {{ service_cidr }}
diff --git a/roles/vhosts/validation/tasks/main.yml b/roles/vhosts/validation/tasks/main.yml
new file mode 100644
index 0000000..ed97d53
--- /dev/null
+++ b/roles/vhosts/validation/tasks/main.yml
@@ -0,0 +1 @@
+---