diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
new file mode 100644
index 0000000..e8d4eb6
--- /dev/null
+++ b/docs/gpu-k8s-role.md
@@ -0,0 +1,44 @@
+# GPU Kubernetes Role
+
+This document describes how to use the `gpu-k8s` role to deploy a simple Kubernetes cluster with NVIDIA GPU support.
+
+## Overview
+
+The role performs three main tasks:
+
+1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes.
+2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
+3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
+
+The following command is used to create the cluster (the image tags, IP addresses, `--env` value, and `--cmd` value come from the role defaults and can be overridden):
+
+```bash
+sealos run \
+  registry.cn-shanghai.aliyuncs.com/labring/kubernetes:v1.29.9 \
+  registry.cn-shanghai.aliyuncs.com/labring/cilium:v1.13.4 \
+  registry.cn-shanghai.aliyuncs.com/labring/helm:v3.9.4 \
+  --masters 172.16.11.120 \
+  --nodes 172.16.11.152 \
+  --env '{}' \
+  --cmd "kubeadm init --skip-phases=addon/kube-proxy"
+```
+
+After the cluster is running, the role installs the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster.
+
+## Usage
+
+Add the role to your playbook:
+
+```yaml
+- hosts: all
+  roles:
+    - gpu-k8s
+```
+
+Run the playbook with an inventory that contains the master and node IP addresses. Hosts must be listed by IP address, because the master-only tasks run on the host whose `inventory_hostname` equals `master_ip`.
+
+```bash
+ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
+```
+
+The final step runs `nvidia-smi` inside a Kubernetes pod to confirm that the GPU is available.
diff --git a/playbooks/demo_gpu_k8s.yml b/playbooks/demo_gpu_k8s.yml
new file mode 100644
index 0000000..270765a
--- /dev/null
+++ b/playbooks/demo_gpu_k8s.yml
@@ -0,0 +1,8 @@
+---
+# Demo playbook: applies the gpu-k8s role to every inventory host with privilege escalation.
+# NOTE(review): the role is added under playbooks/roles/vhosts/; confirm roles_path resolves `gpu-k8s` there.
+- name: Deploy a GPU-enabled Kubernetes cluster
+  hosts: all
+  become: true
+  roles:
+    - gpu-k8s
diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
new file mode 100644
index 0000000..92c84eb
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@@ -0,0 +1,27 @@
+---
+# Default variables for gpu-k8s role
+
+# Image tags passed to `sealos run`. Quoted so they are always read as strings.
+# `sealos_version` is used as the tag of the labring/kubernetes image.
+sealos_version: "v1.29.9"
+cilium_version: "v1.13.4"
+helm_version: "v3.9.4"
+
+# Hosts passed to `--masters` / `--nodes`. Master-only tasks run on the host
+# whose inventory_hostname equals master_ip.
+master_ip: "172.16.11.120"
+node_ip: "172.16.11.152"
+
+# Values passed to `sealos run --env` and `--cmd`.
+sealos_cmd_env: '{}'
+kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"
+
+# Version suffix of the nvidia-driver-* / nvidia-headless-* apt packages.
+nvidia_driver_version: "535"
+
+# Manifest applied to deploy the NVIDIA device plugin.
+nvidia_device_plugin_manifest: "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml"
+
+# Image and timeout used by the nvidia-smi validation pod.
+gpu_test_image: "nvidia/cuda:12.3.2-base-ubuntu22.04"
+gpu_test_timeout: "300s"
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
new file mode 100644
index 0000000..caa7eb9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -0,0 +1,20 @@
+---
+# Bootstraps the cluster with sealos. Runs only on the host whose
+# inventory_hostname equals master_ip.
+- name: Run sealos to create Kubernetes cluster
+  shell: |
+    set -eo pipefail
+    sealos run \
+      registry.cn-shanghai.aliyuncs.com/labring/kubernetes:{{ sealos_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/cilium:{{ cilium_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/helm:{{ helm_version }} \
+      --masters {{ master_ip }} \
+      --nodes {{ node_ip }} \
+      --env '{{ sealos_cmd_env }}' \
+      --cmd "{{ kubeadm_init_cmd }}"
+  args:
+    executable: /bin/bash
+    # Skip on re-runs once the admin kubeconfig exists, so the play is repeatable.
+    # NOTE(review): assumes kubeadm's default kubeconfig path - confirm for sealos.
+    creates: /etc/kubernetes/admin.conf
+  when: inventory_hostname == master_ip
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
new file mode 100644
index 0000000..d18656d
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
@@ -0,0 +1,28 @@
+---
+# Installs the NVIDIA driver and container runtime packages on each host.
+# NOTE(review): nothing here configures containerd to use the NVIDIA runtime;
+# confirm that is handled elsewhere.
+- name: Add NVIDIA repository
+  shell: |
+    set -eo pipefail
+    add-apt-repository -y ppa:graphics-drivers
+    # -f makes curl fail on HTTP errors instead of piping an error page onward.
+    curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
+    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
+    curl -fsSL "https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list" \
+      | tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+    apt-get update
+  args:
+    executable: /bin/bash
+  become: true
+
+- name: Install NVIDIA driver and container runtime
+  apt:
+    name:
+      - nvidia-modprobe
+      - "nvidia-driver-{{ nvidia_driver_version }}"
+      - "nvidia-headless-{{ nvidia_driver_version }}"
+      - nvidia-container-runtime
+    state: present
+    update_cache: true
+  become: true
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
new file mode 100644
index 0000000..63cc9c9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
@@ -0,0 +1,10 @@
+---
+# Entry point of the gpu-k8s role: cluster bootstrap, driver install, then validation.
+- name: Bootstrap Kubernetes cluster with GPU support
+  include_tasks: install_cluster.yml
+
+- name: Install NVIDIA GPU drivers
+  include_tasks: install_driver.yml
+
+- name: Validate GPU access with test workload
+  include_tasks: run_test.yml
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
new file mode 100644
index 0000000..c0fd468
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -0,0 +1,30 @@
+---
+# Deploys the NVIDIA device plugin and runs nvidia-smi in a pod. Runs only on
+# the host whose inventory_hostname equals master_ip.
+- name: Deploy NVIDIA device plugin
+  shell: kubectl apply -f {{ nvidia_device_plugin_manifest }}
+  args:
+    executable: /bin/bash
+  when: inventory_hostname == master_ip
+
+- name: Run CUDA validation pod
+  shell: |
+    set -eo pipefail
+    # Status messages go to stderr so stdout holds only the nvidia-smi output.
+    # Remove a pod left behind by an earlier failed run.
+    kubectl delete pod gpu-test --ignore-not-found --wait >&2
+    kubectl run gpu-test --image={{ gpu_test_image }} --restart=Never -- nvidia-smi >&2
+    # Wait for completion before reading logs; deleting right away would discard the result.
+    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout={{ gpu_test_timeout }} >&2
+    kubectl logs gpu-test
+    kubectl delete pod gpu-test --wait >&2
+  args:
+    executable: /bin/bash
+  register: gpu_test_result
+  changed_when: false
+  when: inventory_hostname == master_ip
+
+- name: Show nvidia-smi output from the validation pod
+  debug:
+    var: gpu_test_result.stdout_lines
+  when: inventory_hostname == master_ip