Add non-root deployment support for gpu-k8s role

This commit is contained in:
shenlan 2025-06-26 09:16:11 +08:00
parent 3a6d50db3a
commit 74464a22be
4 changed files with 21 additions and 2 deletions

View File

@ -30,6 +30,8 @@ sealos run \
--env '{}' \
--cmd "kubeadm init --skip-phases=addon/kube-proxy"
```
When deploying as a non-root user, the command also requires the `--ssh-user`
option and a `--ssh-private-key` option pointing to that user's private key.
After the cluster is running, the role installs the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster.
@ -45,17 +47,23 @@ Add the role to your playbook along with the `ssh-trust` role which configures p
```
By default the SSH key is created for the same user Ansible connects with. You
can override this by setting `ssh_user`:
can override this by setting `ssh_user`. When `ansible_user` is defined, it is
used automatically; otherwise `root` is assumed. The role also lets you
specify the private key path via `ssh_private_key`:
```yaml
- hosts: all
vars:
ssh_user: ubuntu
ssh_private_key: /home/ubuntu/.ssh/myuser_id_rsa
roles:
- ssh-trust
- gpu-k8s
```
The specified user must be able to log in to the target hosts over SSH without
a password (key-based authentication) and must have sudo access on them.
Example playbook snippet defining the IP lists:

View File

@ -10,3 +10,4 @@ node_ips: [] # List of worker node IPs
sealos_cmd_env: '{}'  # Value passed to `sealos run --env`
kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"  # Value passed to `sealos run --cmd`
# SSH login user for the cluster nodes: the Ansible connection user when
# `ansible_user` is defined, otherwise root.
ssh_user: "{{ ansible_user | default('root') }}"
# Private key used for `ssh -i` and `sealos run --ssh-private-key`; falls back
# to ~/.ssh/id_rsa when `ansible_ssh_private_key_file` is not set.
# NOTE(review): the `~` default is presumably expanded by the shell of whichever
# user runs the task, so under `become: true` it may resolve to root's home
# rather than the SSH user's home — confirm.
ssh_private_key: "{{ ansible_ssh_private_key_file | default('~/.ssh/id_rsa') }}"

View File

@ -35,6 +35,7 @@
mv sealos /usr/bin/sealos
args:
executable: /bin/bash
become: true
when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
- name: Install Helm
@ -45,6 +46,7 @@
rm -f get_helm.sh
args:
executable: /bin/bash
become: true
when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first)) and (ansible_facts['distribution'] == 'Ubuntu' or ansible_facts['distribution'] == 'Debian')
- name: Install nerdctl
@ -53,6 +55,7 @@
tar -C /usr/local -xzf nerdctl-2.0.2-linux-amd64.tar.gz
args:
executable: /bin/bash
become: true
when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
- name: Determine LabRing registry prefix
@ -75,7 +78,7 @@
- name: Verify passwordless SSH access to all cluster nodes
shell: >-
ssh -o BatchMode=yes -o StrictHostKeyChecking=no \
-i {{ ansible_ssh_private_key_file | default('~/.ssh/id_rsa') }} \
-i {{ ssh_private_key }} \
{{ ssh_user | default(ansible_ssh_user | default(ansible_user, true) | default('root')) }}@{{ item }} hostname
loop: "{{ master_ips + node_ips }}"
delegate_to: "{{ ops_host | default(masters | default(master_ips) | first) }}"
@ -92,16 +95,20 @@
{{ labring_registry.stdout }}/helm:{{ helm_version }} \
--masters {{ master_ips | join(',') }} \
--nodes {{ node_ips | join(',') }} \
--ssh-user {{ ssh_user }} \
--ssh-private-key {{ ssh_private_key }} \
--env '{{ sealos_cmd_env }}' \
--cmd "{{ kubeadm_init_cmd }}"
args:
executable: /bin/bash
become: true
when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
# Runs `sealos version` with privilege escalation and stores the result in
# `sealos_cli_version`. The task never reports "changed" and runs only on the
# ops host (`ops_host`, falling back to the first entry of `masters` or
# `master_ips`).
- name: Show sealos CLI version
command: sealos version
register: sealos_cli_version
changed_when: false
become: true
when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
- name: Display Kubernetes cluster status
@ -110,6 +117,7 @@
executable: /bin/bash
register: k8s_status
changed_when: false
become: true
when: inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
- name: Print cluster nodes

View File

@ -2,6 +2,7 @@
shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
args:
executable: /bin/bash
become: true
when: master_ips|length > 0 and inventory_hostname == (ops_host | default(masters | default(master_ips) | first))
- name: Run CUDA validation pod
@ -10,5 +11,6 @@
kubectl delete pod gpu-test --wait
args:
executable: /bin/bash
become: true
when: master_ips|length > 0 and inventory_hostname == (ops_host | default(masters | default(master_ips) | first))