From 74464a22bef8c4554a893f578233daf714502fd8 Mon Sep 17 00:00:00 2001
From: shenlan <manbuzhe2009@qq.com>
Date: Thu, 26 Jun 2025 09:16:11 +0800
Subject: [PATCH] Add non-root deployment support for gpu-k8s role

---
 docs/gpu-k8s-role.md                                   | 10 +++++++++-
 playbooks/roles/vhosts/gpu-k8s/defaults/main.yml       |  1 +
 .../roles/vhosts/gpu-k8s/tasks/install_cluster.yml     | 10 +++++++++-
 playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml      |  2 ++
 4 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
index a0a8df8..3f40606 100644
--- a/docs/gpu-k8s-role.md
+++ b/docs/gpu-k8s-role.md
@@ -30,6 +30,8 @@ sealos run \
   --env '{}' \
   --cmd "kubeadm init --skip-phases=addon/kube-proxy"
 ```
+When deploying as a non-root user, the command also requires the `--ssh-user`
+and `--ssh-private-key` options; the latter must point to that user's key.
 
 After the cluster is running the role installs the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster.
 
@@ -45,17 +47,23 @@ Add the role to your playbook along with the `ssh-trust` role which configures p
 ```
 
 By default the SSH key is created for the same user Ansible connects with. You
-can override this by setting `ssh_user`:
+can override this by setting `ssh_user`, which defaults to `ansible_user` when
+that is defined and to `root` otherwise. The private key path can be set via
+`ssh_private_key` (default: `ansible_ssh_private_key_file`, else `~/.ssh/id_rsa`):
 
 ```yaml
 - hosts: all
   vars:
     ssh_user: ubuntu
+    ssh_private_key: /home/ubuntu/.ssh/myuser_id_rsa
   roles:
     - ssh-trust
     - gpu-k8s
 ```
 
+The specified user must be able to log in over SSH with key-based
+authentication (no password prompt) and have sudo access on the target hosts.
+
 
 Example playbook snippet defining the IP lists:
 
diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
index fb86f37..78febbb 100644
--- a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@@ -10,3 +10,4 @@ node_ips: []    # List of worker node IPs
 sealos_cmd_env: '{}'
 kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"
 ssh_user: "{{ ansible_user | default('root') }}"
+ssh_private_key: "{{ ansible_ssh_private_key_file | default('~/.ssh/id_rsa') }}"
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
index 752742f..10e6837 100644
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -35,6 +35,7 @@
     mv sealos /usr/bin/sealos
   args:
     executable: /bin/bash
+  become: true
   when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
 
 - name: Install Helm
@@ -45,6 +46,7 @@
     rm -f get_helm.sh
   args:
     executable: /bin/bash
+  become: true
   when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) and (ansible_facts['distribution'] == 'Ubuntu' or ansible_facts['distribution'] == 'Debian')
 
 - name: Install nerdctl
@@ -53,6 +55,7 @@
     tar -C /usr/local -xzf nerdctl-2.0.2-linux-amd64.tar.gz
   args:
     executable: /bin/bash
+  become: true
   when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
 
 - name: Determine LabRing registry prefix
@@ -75,7 +78,7 @@
 - name: Verify passwordless SSH access to all cluster nodes
   shell: >-
     ssh -o BatchMode=yes -o StrictHostKeyChecking=no \
-    -i {{ ansible_ssh_private_key_file | default('~/.ssh/id_rsa') }} \
+    -i {{ ssh_private_key }} \
     {{ ssh_user | default(ansible_ssh_user | default(ansible_user, true) | default('root')) }}@{{ item }} hostname
   loop: "{{ master_ips + node_ips }}"
   delegate_to: "{{ ops_host | default(masters | default(master_ips) | first) }}"
@@ -92,16 +95,20 @@
       {{ labring_registry.stdout }}/helm:{{ helm_version }} \
       --masters {{ master_ips | join(',') }} \
       --nodes {{ node_ips | join(',') }} \
+      --ssh-user {{ ssh_user }} \
+      --ssh-private-key {{ ssh_private_key }} \
       --env '{{ sealos_cmd_env }}' \
       --cmd "{{ kubeadm_init_cmd }}"
   args:
     executable: /bin/bash
+  become: true
   when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
 
 - name: Show sealos CLI version
   command: sealos version
   register: sealos_cli_version
   changed_when: false
+  become: true
   when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
 
 - name: Display Kubernetes cluster status
@@ -110,6 +117,7 @@
     executable: /bin/bash
   register: k8s_status
   changed_when: false
+  become: true
   when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
 
 - name: Print cluster nodes
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
index 76255f4..0054d07 100644
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -2,6 +2,7 @@
   shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
   args:
     executable: /bin/bash
+  become: true
   when: master_ips|length > 0 and inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
 
 - name: Run CUDA validation pod
@@ -10,5 +11,6 @@
     kubectl delete pod gpu-test --wait
   args:
     executable: /bin/bash
+  become: true
   when: master_ips|length > 0 and inventory_hostname == (ops_host | default(masters | default(master_ips) | first))