diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
index e8d4eb6..37b65e1 100644
--- a/docs/gpu-k8s-role.md
+++ b/docs/gpu-k8s-role.md
@@ -10,7 +10,8 @@ The role performs three main tasks:
 2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
 3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
 
-The following command is used to create the cluster:
+
+The following command is used to create the cluster (example with one master and one worker):
 
 ```bash
 sealos run \
@@ -35,8 +36,27 @@ Add the role to your playbook:
     - gpu-k8s
 ```
 
+
+Example playbook snippet defining the IP lists:
+
+```yaml
+- hosts: all
+  vars:
+    master_ips:
+      - "172.16.11.120"
+    node_ips:
+      - "172.16.11.152"
+  roles:
+    - gpu-k8s
+```
+
+The playbook expects `master_ips` and `node_ips` variables which are lists of IP addresses. Up to
+three masters can be specified.
+
+
 Run the playbook with your inventory that contains the master and node IP addresses.
 
+
 ```bash
 ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
 ```
diff --git a/playbooks/demo_gpu_k8s.yml b/playbooks/demo_gpu_k8s.yml
index 270765a..980bab6 100644
--- a/playbooks/demo_gpu_k8s.yml
+++ b/playbooks/demo_gpu_k8s.yml
@@ -1,4 +1,9 @@
 - hosts: all
   become: true
+  vars:
+    master_ips:
+      - "172.16.11.120"
+    node_ips:
+      - "172.16.11.152"
   roles:
     - gpu-k8s
diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
index 92c84eb..dfe4eaf 100644
--- a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@@ -2,7 +2,7 @@
 sealos_version: v1.29.9
 cilium_version: v1.13.4
 helm_version: v3.9.4
-master_ip: "172.16.11.120"
-node_ip: "172.16.11.152"
+master_ips: []  # Required: up to three master node IPs; the first entry runs the sealos/kubectl tasks
+node_ips: []  # List of worker node IPs, joined with ',' for `sealos run --nodes`
 sealos_cmd_env: '{}'
 kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
index caa7eb9..e001dd4 100644
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -4,10 +4,10 @@
       registry.cn-shanghai.aliyuncs.com/labring/kubernetes:{{ sealos_version }} \
       registry.cn-shanghai.aliyuncs.com/labring/cilium:{{ cilium_version }} \
       registry.cn-shanghai.aliyuncs.com/labring/helm:{{ helm_version }} \
-      --masters {{ master_ip }} \
-      --nodes {{ node_ip }} \
+      --masters {{ master_ips | join(',') }} \
+      --nodes {{ node_ips | join(',') }} \
       --env '{{ sealos_cmd_env }}' \
       --cmd "{{ kubeadm_init_cmd }}"
   args:
     executable: /bin/bash
-  when: inventory_hostname == master_ip
+  when: inventory_hostname == (master_ips | first | mandatory(msg='master_ips must contain at least one master IP'))
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
index c0fd468..8cf35e7 100644
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -2,7 +2,7 @@
   shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
   args:
     executable: /bin/bash
-  when: inventory_hostname == master_ip
+  when: inventory_hostname == (master_ips | first | mandatory(msg='master_ips must contain at least one master IP'))
 
 - name: Run CUDA validation pod
   shell: |
@@ -10,4 +10,4 @@
     kubectl delete pod gpu-test --wait
   args:
     executable: /bin/bash
-  when: inventory_hostname == master_ip
+  when: inventory_hostname == (master_ips | first | mandatory(msg='master_ips must contain at least one master IP'))