Improve gpu-k8s role variable handling

2025-06-24 11:26:14 +08:00 · 2025-06-24 11:26:14 +08:00 · 1cfdd1bb1e
commit 1cfdd1bb1e
parent 62a10adc9f
7 changed files with 132 additions and 0 deletions
--- a/docs/gpu-k8s-role.md
+++ b/docs/gpu-k8s-role.md
@ -0,0 +1,60 @@
+# GPU Kubernetes Role
+
+This document describes how to use the `gpu-k8s` role to deploy a simple Kubernetes cluster with NVIDIA GPU support.
+
+## Overview
+
+The role performs three main tasks:
+
+1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes.
+2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
+3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
+
+The following command is used to create the cluster (example with one master and one worker):
+
+```bash
+sealos run \
+  registry.cn-shanghai.aliyuncs.com/labring/kubernetes:v1.29.9 \
+  registry.cn-shanghai.aliyuncs.com/labring/cilium:v1.13.4 \
+  registry.cn-shanghai.aliyuncs.com/labring/helm:v3.9.4 \
+  --masters 172.16.11.120 \
+  --nodes 172.16.11.152 \
+  --env '{}' \
+  --cmd "kubeadm init --skip-phases=addon/kube-proxy"
+```
+
+After the cluster is running, the role installs the NVIDIA driver and container runtime, deploys the NVIDIA device plugin, and runs a test pod to ensure `nvidia-smi` works inside the cluster.
+
+## Usage
+
+Add the role to your playbook:
+
+```yaml
+- hosts: all
+  roles:
+    - gpu-k8s
+```
+
+Example playbook snippet defining the IP lists:
+
+```yaml
+- hosts: all
+  vars:
+    master_ips:
+      - "172.16.11.120"
+    node_ips:
+      - "172.16.11.152"
+  roles:
+    - gpu-k8s
+```
+
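+The playbook expects `master_ips` and `node_ips` variables, which are lists of IP addresses. Up to
+three masters can be specified. The cluster bootstrap and GPU test tasks run only on the host whose
+inventory hostname equals the first entry of `master_ips`, so that IP must appear as a host name in your inventory.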
+
+Run the playbook with your inventory that contains these IP addresses.
+
+```bash
+ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
+```
+
+The final step prints the output of `nvidia-smi` from inside a Kubernetes pod, confirming that the GPU is available.
--- a/playbooks/demo_gpu_k8s.yml
+++ b/playbooks/demo_gpu_k8s.yml
@ -0,0 +1,9 @@
+# Demo play: applies the gpu-k8s role to every inventory host with privilege
+# escalation enabled. master_ips / node_ips supply the cluster topology that
+# the role passes to `sealos run`.
+- hosts: all
+  become: true
+  roles:
+    - gpu-k8s
+  vars:
+    master_ips:
+      - '172.16.11.120'
+    node_ips:
+      - '172.16.11.152'
--- a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@ -0,0 +1,8 @@
+# Default variables for the gpu-k8s role.
+
+# Image tags substituted into the `sealos run` image list.
+# NOTE: despite its name, sealos_version is used as the tag of the
+# labring/kubernetes image.
+sealos_version: 'v1.29.9'
+cilium_version: 'v1.13.4'
+helm_version: 'v3.9.4'
+
+# Master node IPs (up to three), comma-joined into `--masters`. The first
+# entry also selects the host on which the cluster tasks run.
+master_ips: []
+# Worker node IPs, comma-joined into `--nodes`.
+node_ips: []
+
+# Value handed to `sealos run --env`.
+sealos_cmd_env: '{}'
+# Value handed to `sealos run --cmd`.
+kubeadm_init_cmd: 'kubeadm init --skip-phases=addon/kube-proxy'
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@ -0,0 +1,13 @@
+# Fail early with a clear message: `master_ips | first` is undefined when the
+# list is empty (the role default), which otherwise surfaces as an opaque
+# templating error in the `when:` condition below.
+- name: Validate master_ips before bootstrapping
+  assert:
+    that:
+      - master_ips is sequence
+      - master_ips is not string
+      - master_ips | length >= 1
+    fail_msg: "master_ips must be a non-empty list of master node IPs"
+
+# Bootstraps the cluster with sealos. Runs only on the host whose
+# inventory_hostname equals the first entry of master_ips, so that IP must be
+# used as a host name in the inventory.
+- name: Run sealos to create Kubernetes cluster
+  shell: |
+    sealos run \
+      registry.cn-shanghai.aliyuncs.com/labring/kubernetes:{{ sealos_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/cilium:{{ cilium_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/helm:{{ helm_version }} \
+      --masters {{ master_ips | join(',') }} \
+      {% if node_ips %}--nodes {{ node_ips | join(',') }}{% endif %} \
+      --env {{ sealos_cmd_env | quote }} \
+      --cmd {{ kubeadm_init_cmd | quote }}
+  args:
+    executable: /bin/bash
+    # Skip on re-runs: kubeadm writes this kubeconfig once the control plane
+    # has been initialised, so its presence means the cluster already exists.
+    creates: /etc/kubernetes/admin.conf
+  when: inventory_hostname == (master_ips | first)
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
@ -0,0 +1,21 @@
+# Registers the graphics-drivers PPA and the NVIDIA container runtime apt
+# repository (signing key + sources list), then refreshes the package index.
+- name: Add NVIDIA repository
+  shell: |
+    # Abort on the first failing command, including inside pipelines, so a
+    # failed download is never fed into apt-key or the sources list unnoticed.
+    set -eo pipefail
+    add-apt-repository -y ppa:graphics-drivers
+    # -f makes curl exit non-zero on HTTP errors instead of printing the error page.
+    curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
+    # e.g. "ubuntu22.04": ID and VERSION_ID concatenated from /etc/os-release.
+    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
+    curl -fsSL "https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list" | tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+    apt-get update
+  args:
+    executable: /bin/bash
+  become: true
+
+# Installs the 535-series NVIDIA driver packages and the NVIDIA container
+# runtime via apt, refreshing the package cache first.
+# NOTE(review): no task visible in this role configures the container engine
+# to use the installed runtime — confirm whether that is handled elsewhere.
+- name: Install NVIDIA driver and container runtime
+  become: true
+  apt:
+    update_cache: true
+    state: present
+    name:
+      - nvidia-modprobe
+      - nvidia-driver-535
+      - nvidia-headless-535
+      - nvidia-container-runtime
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
@ -0,0 +1,8 @@
+# Role entry point. The three stages run in order: cluster bootstrap,
+# driver/runtime installation, then the GPU smoke test.
+
+# Stage 1: create the cluster with sealos.
+- name: Bootstrap Kubernetes cluster with GPU support
+  include_tasks:
+    file: install_cluster.yml
+
+# Stage 2: install the NVIDIA driver and container runtime packages.
+- name: Install NVIDIA GPU drivers
+  include_tasks:
+    file: install_driver.yml
+
+# Stage 3: deploy the device plugin and run the CUDA test pod.
+- name: Validate GPU access with test workload
+  include_tasks:
+    file: run_test.yml
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@ -0,0 +1,13 @@
+# Applies the upstream NVIDIA device plugin manifest (pinned to v0.14.5).
+# Executed only on the host matching the first master IP.
+- name: Deploy NVIDIA device plugin
+  when: inventory_hostname == (master_ips | first)
+  args:
+    executable: /bin/bash
+  shell: >-
+    kubectl apply -f
+    https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
+
+# Runs `nvidia-smi` in a one-shot pod and captures its output. The pod must be
+# allowed to finish and its logs read before it is deleted; deleting it right
+# after creation would discard the result and hide failures.
+# NOTE(review): the pod does not request an `nvidia.com/gpu` resource, so it
+# relies on the node runtime exposing the GPU by default — confirm.
+- name: Run CUDA validation pod
+  shell: |
+    set -euo pipefail
+    # Remove any leftover pod from an earlier, interrupted run.
+    kubectl delete pod gpu-test --ignore-not-found --wait >&2
+    kubectl run gpu-test --image=nvidia/cuda:12.3.2-base-ubuntu22.04 --restart=Never -- nvidia-smi >&2
+    # Remember the wait result but keep going so logs are shown and the pod
+    # is cleaned up even when the workload fails or times out.
+    rc=0
+    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout=300s >&2 || rc=$?
+    # Only the pod logs go to stdout, so stdout is exactly the nvidia-smi output.
+    kubectl logs gpu-test || true
+    kubectl delete pod gpu-test --wait >&2
+    exit "$rc"
+  args:
+    executable: /bin/bash
+  register: gpu_test_result
+  # The pod is created and removed within the task; no lasting change remains.
+  changed_when: false
+  when: inventory_hostname == (master_ips | first)
+
+- name: Show nvidia-smi output from the validation pod
+  debug:
+    var: gpu_test_result.stdout_lines
+  when: inventory_hostname == (master_ips | first)