From 1179e78d2c7fc16b4892b77993265f1515a3ab7f Mon Sep 17 00:00:00 2001
From: shenlan
Date: Thu, 29 May 2025 13:55:46 +0800
Subject: [PATCH 1/2] Update setup-k3s-cluster-with-br0.sh

Install the IPVS tools, install K3s with flannel and the built-in network
policy controller disabled and kube-proxy in IPVS mode, deploy Calico
v3.26.0 through the Tigera operator, and clear the cloud-provider
"uninitialized" taint.

---
Review notes:
* All K3s flags now live in INSTALL_K3S_EXEC and start with "server".
  Before, INSTALL_K3S_EXEC held only flags and "server --kube-proxy-arg
  proxy-mode=ipvs" was passed as installer arguments; the installer appends
  its arguments to INSTALL_K3S_EXEC, which put "server" after the flags.
* kubectl no longer receives $KUBECONFIG as a positional argument. kubectl
  reads the KUBECONFIG environment variable on its own, and the other
  kubectl calls in this script are plain "kubectl".
* The taint removal is best-effort, matching its "if any" comment.
* NOTE(review): the hunk context shows two earlier installer runs right
  above the new one - confirm that a third install is intended.
* NOTE(review): --flannel-iface=br0 is kept next to --flannel-backend=none,
  and no --cluster-cidr is passed - confirm the IP pool in Calico's
  custom-resources.yaml matches the K3s cluster CIDR.
* NOTE(review): whitespace of context lines was reconstructed and the blob
  ids in the "index" line were not recomputed.

 .../k3s-cluster/setup-k3s-cluster-with-br0.sh | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh b/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh
index de5c433..6977ac9 100644
--- a/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh
+++ b/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh
@@ -7,6 +7,21 @@ curl -sfL https://get.k3s.io | sh -
 
 export INSTALL_K3S_EXEC="server --data-dir=/mnt/opt/rancher/k3s --disable=traefik,servicelb,local-storage --kube-apiserver-arg=service-node-port-range=0-50000 --system-default-registry=registry.cn-hangzhou.aliyuncs.com --flannel-iface=br0"
 curl -sfL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh | sh -
+
+# Install the IPVS userspace tools used with kube-proxy's ipvs mode
+sudo apt-get -y install ipset ipvsadm
+
+# Install K3s without flannel/network policy (Calico replaces them) and with kube-proxy in IPVS mode
+curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --kube-apiserver-arg=service-node-port-range=0-50000 --flannel-iface=br0 --disable traefik,metrics-server,servicelb --disable-cloud-controller --kubelet-arg cloud-provider=external --flannel-backend=none --disable-network-policy --kube-proxy-arg proxy-mode=ipvs" K3S_KUBECONFIG_MODE="644" sh -s -
+
+# Install Calico: the Tigera operator first, then the resources it reconciles
+kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/tigera-operator.yaml
+kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml
+
+# Remove the cloud-provider "uninitialized" taint if any (set when started without cloud-manager); kubectl exits non-zero when it is absent, so that is ignored
+sudo kubectl taint nodes --all node.cloudprovider.kubernetes.io/uninitialized=false:NoSchedule- || true
+
+
 # === 设置本地 kubeconfig ===
 mkdir -p ~/.kube
 cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
@@ -21,3 +36,7 @@ until kubectl get pods -A 2>/dev/null | grep -q "coredns.*Running"; do
   sleep 3
 done
 echo "✅ K3s 安装完成,kubectl/helm 已就绪"
+
+
+
+

From 3f7b870cc41e78f91a3249dec8cad0030ce554d8 Mon Sep 17 00:00:00 2001
From: shenlan
Date: Tue, 24 Jun 2025 10:45:35 +0800
Subject: [PATCH 2/2] Add gpu-k8s ansible role and docs

Add a role that bootstraps a Kubernetes cluster with sealos, installs the
NVIDIA driver and container runtime packages, deploys the NVIDIA device
plugin and runs nvidia-smi in a test pod, plus a demo playbook and docs.

---
Review notes:
* run_test.yml: the test pod now requests nvidia.com/gpu, is awaited, has
  its log printed and is always cleaned up. Before, "kubectl run" was
  followed directly by "kubectl delete", so nvidia-smi output was never
  collected although the docs promise it.
* install_driver.yml: the shell block stops on the first failure
  (set -eo pipefail, curl -f) instead of feeding an HTTP error page to
  apt-key or into the sources list.
* install_cluster.yml: "creates" keeps sealos from re-running on every play.
* docs: usage example gained "become: true" like playbooks/demo_gpu_k8s.yml,
  and the inventory requirement is spelled out.
* NOTE(review): the role lives in playbooks/roles/vhosts/gpu-k8s but is
  referenced as "gpu-k8s" - confirm roles_path covers roles/vhosts.
* NOTE(review): nothing in this role configures the container runtime to
  use nvidia-container-runtime, and the driver is installed after the
  cluster without a reboot - confirm this is handled elsewhere.
* NOTE(review): apt-key is deprecated on current Ubuntu releases; blob ids
  in the "index" lines were not recomputed after the edits.

 docs/gpu-k8s-role.md                          | 45 +++++++++++++++++++
 playbooks/demo_gpu_k8s.yml                    |  4 ++
 .../roles/vhosts/gpu-k8s/defaults/main.yml    |  8 ++++
 .../vhosts/gpu-k8s/tasks/install_cluster.yml  | 15 +++++++
 .../vhosts/gpu-k8s/tasks/install_driver.yml   | 23 ++++++++++
 playbooks/roles/vhosts/gpu-k8s/tasks/main.yml |  8 ++++
 .../roles/vhosts/gpu-k8s/tasks/run_test.yml   | 41 +++++++++++++++++
 7 files changed, 144 insertions(+)
 create mode 100644 docs/gpu-k8s-role.md
 create mode 100644 playbooks/demo_gpu_k8s.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml

diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
new file mode 100644
index 0000000..e8d4eb6
--- /dev/null
+++ b/docs/gpu-k8s-role.md
@@ -0,0 +1,45 @@
+# GPU Kubernetes Role
+
+This document describes how to use the `gpu-k8s` role to deploy a simple Kubernetes cluster with NVIDIA GPU support.
+
+## Overview
+
+The role performs three main tasks:
+
+1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes.
+2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
+3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
+
+With the default variables, the following command is run on the master to create the cluster:
+
+```bash
+sealos run \
+  registry.cn-shanghai.aliyuncs.com/labring/kubernetes:v1.29.9 \
+  registry.cn-shanghai.aliyuncs.com/labring/cilium:v1.13.4 \
+  registry.cn-shanghai.aliyuncs.com/labring/helm:v3.9.4 \
+  --masters 172.16.11.120 \
+  --nodes 172.16.11.152 \
+  --env '{}' \
+  --cmd "kubeadm init --skip-phases=addon/kube-proxy"
+```
+
+After the cluster is running and the drivers are installed, the role deploys the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster.
+
+## Usage
+
+Add the role to your playbook:
+
+```yaml
+- hosts: all
+  become: true
+  roles:
+    - gpu-k8s
+```
+
+Run the playbook with an inventory whose host names are the master and node IP addresses; the `sealos` and `kubectl` tasks only run on the host whose `inventory_hostname` equals `master_ip`.
+
+```bash
+ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
+```
+
+The final step prints the output of `nvidia-smi` from inside a Kubernetes pod, confirming that the GPU is available.
diff --git a/playbooks/demo_gpu_k8s.yml b/playbooks/demo_gpu_k8s.yml
new file mode 100644
index 0000000..270765a
--- /dev/null
+++ b/playbooks/demo_gpu_k8s.yml
@@ -0,0 +1,4 @@
+- hosts: all
+  become: true
+  roles:
+    - gpu-k8s
diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
new file mode 100644
index 0000000..92c84eb
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@@ -0,0 +1,8 @@
+# Default variables for gpu-k8s role
+sealos_version: v1.29.9  # tag of the labring/kubernetes image given to `sealos run`
+cilium_version: v1.13.4  # tag of the labring/cilium image
+helm_version: v3.9.4  # tag of the labring/helm image
+master_ip: "172.16.11.120"  # --masters; cluster tasks run where inventory_hostname equals this
+node_ip: "172.16.11.152"  # --nodes
+sealos_cmd_env: '{}'  # value of --env
+kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"  # value of --cmd
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
new file mode 100644
index 0000000..caa7eb9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -0,0 +1,15 @@
+- name: Run sealos to create Kubernetes cluster
+  shell: |
+    sealos run \
+      registry.cn-shanghai.aliyuncs.com/labring/kubernetes:{{ sealos_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/cilium:{{ cilium_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/helm:{{ helm_version }} \
+      --masters {{ master_ip }} \
+      --nodes {{ node_ip }} \
+      --env '{{ sealos_cmd_env }}' \
+      --cmd "{{ kubeadm_init_cmd }}"
+  args:
+    executable: /bin/bash
+    # Skip on re-runs once kubeadm has written the admin kubeconfig.
+    creates: /etc/kubernetes/admin.conf
+  when: inventory_hostname == master_ip
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
new file mode 100644
index 0000000..d18656d
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
@@ -0,0 +1,23 @@
+- name: Add NVIDIA repository
+  shell: |
+    # Abort on the first failing command, including a failed download in a pipe.
+    set -eo pipefail
+    add-apt-repository -y ppa:graphics-drivers
+    curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
+    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
+    curl -fsSL "https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list" | tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+    apt-get update
+  args:
+    executable: /bin/bash
+  become: true
+
+- name: Install NVIDIA driver and container runtime
+  apt:
+    name:
+      - nvidia-modprobe
+      - nvidia-driver-535
+      - nvidia-headless-535
+      - nvidia-container-runtime
+    state: present
+    update_cache: yes
+  become: true
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
new file mode 100644
index 0000000..63cc9c9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
@@ -0,0 +1,8 @@
+- name: Bootstrap Kubernetes cluster with GPU support
+  include_tasks: install_cluster.yml
+
+- name: Install NVIDIA GPU drivers
+  include_tasks: install_driver.yml
+
+- name: Validate GPU access with test workload
+  include_tasks: run_test.yml
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
new file mode 100644
index 0000000..c0fd468
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -0,0 +1,41 @@
+- name: Deploy NVIDIA device plugin
+  shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
+  args:
+    executable: /bin/bash
+  when: inventory_hostname == master_ip
+
+# The pod is created from a manifest so it can request a GPU; it is awaited
+# and its log is read before it is deleted.
+- name: Run CUDA validation pod
+  shell: |
+    set -eo pipefail
+    cleanup() { kubectl delete pod gpu-test --ignore-not-found --wait >/dev/null; }
+    cleanup            # drop a leftover pod from an earlier, interrupted run
+    trap cleanup EXIT  # always remove the pod created below
+    kubectl apply -f - <<'EOF'
+    apiVersion: v1
+    kind: Pod
+    metadata:
+      name: gpu-test
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: gpu-test
+          image: nvidia/cuda:12.3.2-base-ubuntu22.04
+          args: ["nvidia-smi"]
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+    EOF
+    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout=300s
+    kubectl logs gpu-test
+  args:
+    executable: /bin/bash
+  register: gpu_test
+  changed_when: false
+  when: inventory_hostname == master_ip
+
+- name: Show nvidia-smi output from the test pod
+  debug:
+    var: gpu_test.stdout_lines
+  when: inventory_hostname == master_ip