Add an ssh-trust role and post-install cluster verification to gpu-k8s.

Note: the `index` blob hashes below are carried over from the original patch
and do not reflect the edited task files; plain `git apply` ignores them.

diff --git a/README.md b/README.md
index b6ed6d3..a14b433 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ Additional documentation is stored under the `docs/` folder.
 | `k3s-addon` | 用于安装 Kubernetes 集群插件。 | | | ✔ | | | |
 | `secret-manger` | 密钥管理角色,用于管理密钥。 | | | ✔ | | | |
 | `cert-manager` | 证书管理角色,用于管理证书。 | | | ✔ | | | |
+| `ssh-trust` | 配置 ops 主机与节点的 SSH 互信。 | | | ✔ | | | |
 
 表格说明
 - Docker:是否属于 Docker 角色。
diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
index 26c493d..0b5a89b 100644
--- a/docs/gpu-k8s-role.md
+++ b/docs/gpu-k8s-role.md
@@ -4,11 +4,12 @@ This document describes how to use the `gpu-k8s` role to deploy a simple Kuberne
 
 ## Overview
 
-The role performs three main tasks:
+The role performs four main tasks:
 
 1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes.
 2. **Install NVIDIA drivers and the NVIDIA container toolkit** on the target hosts so that Kubernetes can access GPU resources.
-3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
+3. **Verify the cluster state** after initialization, displaying the `sealos` version and the current Kubernetes nodes.
+4. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
 
 The following command is used to create the cluster (example with one master and one worker):
 
@@ -28,11 +29,12 @@ After the cluster is running the role installs the NVIDIA device plugin and runs
 
 ## Usage
 
-Add the role to your playbook:
+Add the role to your playbook along with the `ssh-trust` role, which configures passwordless SSH access from the ops host to the cluster nodes:
 
 ```yaml
 - hosts: all
   roles:
+    - ssh-trust
     - gpu-k8s
 ```
 
@@ -47,6 +49,7 @@ Example playbook snippet defining the IP lists:
     node_ips:
       - "172.16.11.152"
   roles:
+    - ssh-trust
     - gpu-k8s
 ```
 
diff --git a/playbooks/demo_gpu_k8s.yml b/playbooks/demo_gpu_k8s.yml
index c7f638a..85fa1ca 100644
--- a/playbooks/demo_gpu_k8s.yml
+++ b/playbooks/demo_gpu_k8s.yml
@@ -8,5 +8,6 @@
       - "k8s-2"
       - "k8s-3"
   roles:
+    - roles/vhosts/ssh-trust/
     - roles/vhosts/gpu-k8s/
     - roles/vhosts/common/
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
index 6c5940e..07e8191 100644
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -18,3 +18,27 @@
   args:
     executable: /bin/bash
   when: inventory_hostname == (ops_host | default(master_ips | first))
+
+# Verification: the two query tasks below only read state, so they are
+# marked changed_when: false.
+- name: Show sealos CLI version
+  command: sealos version
+  register: sealos_cli_version
+  changed_when: false
+  when: inventory_hostname == (ops_host | default(master_ips | first))
+
+- name: Display Kubernetes cluster status
+  shell: kubectl get nodes -o wide
+  args:
+    executable: /bin/bash
+  register: k8s_status
+  changed_when: false
+  when: inventory_hostname == (ops_host | default(master_ips | first))
+
+# Print both registered results so the sealos version is not discarded.
+- name: Print sealos version and cluster nodes
+  debug:
+    msg:
+      sealos_version: "{{ sealos_cli_version.stdout_lines }}"
+      nodes: "{{ k8s_status.stdout_lines }}"
+  when: inventory_hostname == (ops_host | default(master_ips | first))
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
index 8cf35e7..6366def 100644
--- a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -2,7 +2,7 @@
   shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
   args:
     executable: /bin/bash
-  when: inventory_hostname == (master_ips | first)
+  when: master_ips|length > 0 and inventory_hostname == (master_ips | first)
 
 - name: Run CUDA validation pod
   shell: |
@@ -10,5 +10,5 @@
     kubectl delete pod gpu-test --wait
   args:
     executable: /bin/bash
-  when: inventory_hostname == (master_ips | first)
+  when: master_ips|length > 0 and inventory_hostname == (master_ips | first)
 
diff --git a/playbooks/roles/vhosts/ssh-trust/tasks/main.yml b/playbooks/roles/vhosts/ssh-trust/tasks/main.yml
new file mode 100644
index 0000000..9609d2a
--- /dev/null
+++ b/playbooks/roles/vhosts/ssh-trust/tasks/main.yml
@@ -0,0 +1,25 @@
+# Configure passwordless SSH from the ops host to every other node.
+# The ops host is `ops_host` when set, otherwise the first entry of
+# `master_ips` (the same fallback the gpu-k8s install_cluster tasks use).
+- name: Ensure SSH key pair exists on ops host
+  shell: ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N '' -q
+  args:
+    executable: /bin/bash
+    # Skip when the key is already there so the task stays idempotent.
+    creates: "~/.ssh/id_rsa"
+  delegate_to: "{{ ops_host | default(master_ips | first) }}"
+  run_once: true
+
+- name: Fetch ops host public key
+  slurp:
+    src: "~/.ssh/id_rsa.pub"
+  register: ops_pub_key
+  delegate_to: "{{ ops_host | default(master_ips | first) }}"
+  run_once: true
+
+# slurp returns the file content base64-encoded, hence the b64decode.
+- name: Authorize ops host key on other nodes
+  ansible.builtin.authorized_key:
+    user: "{{ ansible_user | default('root') }}"
+    key: "{{ ops_pub_key.content | b64decode }}"
+  when: inventory_hostname != (ops_host | default(master_ips | first))