diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
new file mode 100644
index 0000000..45c9eea
--- /dev/null
+++ b/docs/gpu-k8s-role.md
@@ -0,0 +1,60 @@
+# GPU Kubernetes Role
+
+This document describes how to use the `gpu-k8s` role to deploy a simple Kubernetes cluster with NVIDIA GPU support.
+
+## Overview
+
+The role performs three main tasks:
+
+1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes.
+2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
+3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
+
+The following command is used to create the cluster (example with one master and one worker):
+
+```bash
+sealos run \
+  registry.cn-shanghai.aliyuncs.com/labring/kubernetes:v1.29.9 \
+  registry.cn-shanghai.aliyuncs.com/labring/cilium:v1.13.4 \
+  registry.cn-shanghai.aliyuncs.com/labring/helm:v3.9.4 \
+  --masters 172.16.11.120 \
+  --nodes 172.16.11.152 \
+  --env '{}' \
+  --cmd "kubeadm init --skip-phases=addon/kube-proxy"
+```
+
+After the cluster is running, the role installs the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster.
+
+## Usage
+
+Add the role to your playbook:
+
+```yaml
+- hosts: all
+  roles:
+    - gpu-k8s
+```
+
+Example playbook snippet defining the IP lists:
+
+```yaml
+- hosts: all
+  vars:
+    master_ips:
+      - "172.16.11.120"
+    node_ips:
+      - "172.16.11.152"
+  roles:
+    - gpu-k8s
+```
+
+The playbook expects `master_ips` and `node_ips` variables, which are lists of IP addresses. Up to
+three masters can be specified.
+
+Run the playbook with your inventory that contains these IP addresses.
+
+```bash
+ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
+```
+
+The final step prints the output of `nvidia-smi` from inside a Kubernetes pod, confirming that the GPU is available.
diff --git a/playbooks/demo_gpu_k8s.yml b/playbooks/demo_gpu_k8s.yml
new file mode 100644
index 0000000..980bab6
--- /dev/null
+++ b/playbooks/demo_gpu_k8s.yml
@@ -0,0 +1,13 @@
+---
+# Demo playbook: applies the gpu-k8s role to every inventory host.
+# Inventory hostnames must match the IPs below: the role selects its
+# bootstrap host by comparing inventory_hostname with the first master IP.
+- hosts: all
+  become: true
+  vars:
+    master_ips:
+      - "172.16.11.120"
+    node_ips:
+      - "172.16.11.152"
+  roles:
+    - gpu-k8s
diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
new file mode 100644
index 0000000..dfe4eaf
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@@ -0,0 +1,10 @@
+---
+# Default variables for gpu-k8s role
+# NOTE: sealos_version is used as the tag of the labring/kubernetes image.
+sealos_version: v1.29.9
+cilium_version: v1.13.4
+helm_version: v3.9.4
+master_ips: []  # List of up to three master node IPs
+node_ips: []  # List of worker node IPs
+sealos_cmd_env: '{}'
+kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
new file mode 100644
index 0000000..e001dd4
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -0,0 +1,26 @@
+---
+# Bootstraps the cluster with `sealos run`. Only the host whose inventory
+# name equals the first entry of master_ips executes the command.
+- name: Ensure at least one master IP is defined
+  assert:
+    that:
+      - master_ips | length > 0
+    fail_msg: "gpu-k8s: master_ips must contain at least one master node IP"
+
+- name: Run sealos to create Kubernetes cluster
+  # Folded scalar: the lines below are joined into one command line.
+  # --nodes is omitted when node_ips is empty so it cannot swallow --env.
+  shell: >-
+    sealos run
+    registry.cn-shanghai.aliyuncs.com/labring/kubernetes:{{ sealos_version }}
+    registry.cn-shanghai.aliyuncs.com/labring/cilium:{{ cilium_version }}
+    registry.cn-shanghai.aliyuncs.com/labring/helm:{{ helm_version }}
+    --masters {{ master_ips | join(',') }}
+    {% if node_ips %}--nodes {{ node_ips | join(',') }}{% endif %}
+    --env '{{ sealos_cmd_env }}'
+    --cmd "{{ kubeadm_init_cmd }}"
+  args:
+    executable: /bin/bash
+    # kubeadm init writes this file; skip the bootstrap once it exists.
+    creates: /etc/kubernetes/admin.conf
+  when: inventory_hostname == (master_ips | first)
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
new file mode 100644
index 0000000..d18656d
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
@@ -0,0 +1,27 @@
+---
+# Installs the NVIDIA driver and container runtime packages via apt.
+# NOTE(review): nothing in these tasks configures containerd to use the
+# NVIDIA runtime - confirm whether that is handled elsewhere.
+- name: Add NVIDIA repository
+  shell: |
+    # Abort on the first failing command, including inside the curl pipes.
+    set -euo pipefail
+    add-apt-repository -y ppa:graphics-drivers
+    curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
+    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
+    curl -fsSL "https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list" | tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+    apt-get update
+  args:
+    executable: /bin/bash
+  become: true
+
+- name: Install NVIDIA driver and container runtime
+  apt:
+    name:
+      - nvidia-modprobe
+      - nvidia-driver-535
+      - nvidia-headless-535
+      - nvidia-container-runtime
+    state: present
+    update_cache: true
+  become: true
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
new file mode 100644
index 0000000..63cc9c9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
@@ -0,0 +1,10 @@
+---
+# Role entry point: bootstrap the cluster, install GPU drivers, then validate.
+- name: Bootstrap Kubernetes cluster with GPU support
+  include_tasks: install_cluster.yml
+
+- name: Install NVIDIA GPU drivers
+  include_tasks: install_driver.yml
+
+- name: Validate GPU access with test workload
+  include_tasks: run_test.yml
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
new file mode 100644
index 0000000..9ff04d2
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -0,0 +1,33 @@
+---
+# Deploys the NVIDIA device plugin and runs nvidia-smi in a throwaway pod.
+# All tasks run only on the host matching the first entry of master_ips.
+- name: Deploy NVIDIA device plugin
+  shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
+  args:
+    executable: /bin/bash
+  when: inventory_hostname == (master_ips | first)
+
+- name: Run CUDA validation pod
+  shell: |
+    set -euo pipefail
+    # Remove a pod left behind by an earlier failed run so `kubectl run` works.
+    kubectl delete pod gpu-test --ignore-not-found --wait
+    kubectl run gpu-test --image=nvidia/cuda:12.3.2-base-ubuntu22.04 --restart=Never -- nvidia-smi
+    # Block until nvidia-smi has exited successfully; on failure or timeout,
+    # dump diagnostics and keep the pod around for inspection.
+    if ! kubectl wait pod/gpu-test --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s; then
+      kubectl describe pod gpu-test >&2 || true
+      kubectl logs gpu-test >&2 || true
+      exit 1
+    fi
+    kubectl logs gpu-test
+    kubectl delete pod gpu-test --wait
+  args:
+    executable: /bin/bash
+  register: gpu_test_result
+  when: inventory_hostname == (master_ips | first)
+
+- name: Show nvidia-smi output from the validation pod
+  debug:
+    var: gpu_test_result.stdout_lines
+  when: inventory_hostname == (master_ips | first)