From 74464a22bef8c4554a893f578233daf714502fd8 Mon Sep 17 00:00:00 2001 From: shenlan Date: Thu, 26 Jun 2025 09:16:11 +0800 Subject: [PATCH] Add non-root deployment support for gpu-k8s role --- docs/gpu-k8s-role.md | 10 +++++++++- playbooks/roles/vhosts/gpu-k8s/defaults/main.yml | 1 + .../roles/vhosts/gpu-k8s/tasks/install_cluster.yml | 10 +++++++++- playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml | 2 ++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md index a0a8df8..3f40606 100644 --- a/docs/gpu-k8s-role.md +++ b/docs/gpu-k8s-role.md @@ -30,6 +30,8 @@ sealos run \ --env '{}' \ --cmd "kubeadm init --skip-phases=addon/kube-proxy" ``` +If deploying with a non-root user the command also requires `--ssh-user` and +`--ssh-private-key` options pointing to the user's key. After the cluster is running the role installs the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster. @@ -45,17 +47,23 @@ Add the role to your playbook along with the `ssh-trust` role which configures p ``` By default the SSH key is created for the same user Ansible connects with. You -can override this by setting `ssh_user`: +can override this by setting `ssh_user`. When `ansible_user` is defined it will +be used automatically, otherwise `root` is assumed. The role also allows you to +specify the private key path via `ssh_private_key`: ```yaml - hosts: all vars: ssh_user: ubuntu + ssh_private_key: /home/ubuntu/.ssh/myuser_id_rsa roles: - ssh-trust - gpu-k8s ``` +The specified user must be able to log in without a password and have sudo +access on the target hosts. + Example playbook snippet defining the IP lists: diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml index fb86f37..78febbb 100644 --- a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml +++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml @@ -10,3 +10,4 @@ node_ips: [] # List of worker node IPs sealos_cmd_env: '{}' kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy" ssh_user: "{{ ansible_user | default('root') }}" +ssh_private_key: "{{ ansible_ssh_private_key_file | default('~/.ssh/id_rsa') }}" diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml index 752742f..10e6837 100644 --- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml +++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml @@ -35,6 +35,7 @@ mv sealos /usr/bin/sealos args: executable: /bin/bash + become: true when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) - name: Install Helm @@ -45,6 +46,7 @@ rm -f get_helm.sh args: executable: /bin/bash + become: true when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) and (ansible_facts['distribution'] == 'Ubuntu' or ansible_facts['distribution'] == 'Debian') - name: Install nerdctl @@ -53,6 +55,7 @@ tar -C /usr/local -xzf nerdctl-2.0.2-linux-amd64.tar.gz args: executable: /bin/bash + become: true when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) - name: Determine LabRing registry prefix @@ -75,7 +78,7 @@ - name: Verify passwordless SSH access to all cluster nodes shell: >- ssh -o BatchMode=yes -o StrictHostKeyChecking=no \ - -i {{ ansible_ssh_private_key_file | default('~/.ssh/id_rsa') }} \ + -i {{ ssh_private_key }} \ {{ ssh_user | default(ansible_ssh_user | default(ansible_user, true) | default('root')) }}@{{ item }} hostname loop: "{{ master_ips + node_ips }}" delegate_to: "{{ ops_host | default(masters | default(master_ips) | first) }}" @@ -92,16 +95,20 @@ {{ labring_registry.stdout }}/helm:{{ helm_version }} \ --masters {{ master_ips | join(',') }} \ --nodes {{ node_ips | join(',') }} \ + --ssh-user {{ ssh_user }} \ + --ssh-private-key {{ ssh_private_key }} \ --env '{{ sealos_cmd_env }}' \ --cmd "{{ kubeadm_init_cmd }}" args: executable: /bin/bash + become: true when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) - name: Show sealos CLI version command: sealos version register: sealos_cli_version changed_when: false + become: true when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) - name: Display Kubernetes cluster status @@ -110,6 +117,7 @@ executable: /bin/bash register: k8s_status changed_when: false + become: true when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) - name: Print cluster nodes diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml index 76255f4..0054d07 100644 --- a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml +++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml @@ -2,6 +2,7 @@ shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml args: executable: /bin/bash + become: true when: master_ips|length > 0 and inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) - name: Run CUDA validation pod @@ -10,5 +11,6 @@ kubectl delete pod gpu-test --wait args: executable: /bin/bash + become: true when: master_ips|length > 0 and inventory_hostname == (ops_host | default(masters | default(master_ips) | first))