From 1179e78d2c7fc16b4892b77993265f1515a3ab7f Mon Sep 17 00:00:00 2001
From: shenlan <manbuzhe2009@qq.com>
Date: Thu, 29 May 2025 13:55:46 +0800
Subject: [PATCH 1/2] Update setup-k3s-cluster-with-br0.sh

---
 .../k3s-cluster/setup-k3s-cluster-with-br0.sh | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh b/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh
index de5c433..6977ac9 100644
--- a/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh
+++ b/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh
@@ -7,6 +7,21 @@ curl -sfL https://get.k3s.io | sh -
 export INSTALL_K3S_EXEC="server --data-dir=/mnt/opt/rancher/k3s --disable=traefik,servicelb,local-storage --kube-apiserver-arg=service-node-port-range=0-50000 --system-default-registry=registry.cn-hangzhou.aliyuncs.com --flannel-iface=br0"
 curl -sfL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh | sh -
 
+
+# Install IPVS
+sudo apt-get -y install ipset ipvsadm
+
+# Install K3s with Calico and kube-proxy in IPVS mode
+curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--kube-apiserver-arg=service-node-port-range=0-50000 --flannel-iface=br0 --disable traefik,metrics-server,servicelb --disable-cloud-controller --kubelet-arg cloud-provider=external --flannel-backend=none --disable-network-policy" K3S_KUBECONFIG_MODE="644" sh -s - server --kube-proxy-arg proxy-mode=ipvs
+
+# Install Calico. kubectl reads KUBECONFIG from the environment; passing its value as a bare argument makes kubectl parse the path as a subcommand.
+kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/tigera-operator.yaml
+kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml
+
+# Remove taints in k3s if any (usually happens if started without cloud-manager)
+sudo kubectl taint nodes --all node.cloudprovider.kubernetes.io/uninitialized=false:NoSchedule-
+
+
 # === 设置本地 kubeconfig ===
 mkdir -p ~/.kube
 cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
@@ -21,3 +36,7 @@ until kubectl get pods -A 2>/dev/null | grep -q "coredns.*Running"; do
   sleep 3
 done
 echo "✅ K3s 安装完成，kubectl/helm 已就绪"
+
+
+
+

From 3f7b870cc41e78f91a3249dec8cad0030ce554d8 Mon Sep 17 00:00:00 2001
From: shenlan <manbuzhe2009@qq.com>
Date: Tue, 24 Jun 2025 10:45:35 +0800
Subject: [PATCH 2/2] Add gpu-k8s ansible role and docs

---
 docs/gpu-k8s-role.md                          | 44 +++++++++++++++++++
 playbooks/demo_gpu_k8s.yml                    |  4 ++
 .../roles/vhosts/gpu-k8s/defaults/main.yml    |  8 ++++
 .../vhosts/gpu-k8s/tasks/install_cluster.yml  | 13 ++++++
 .../vhosts/gpu-k8s/tasks/install_driver.yml   | 21 +++++++++
 playbooks/roles/vhosts/gpu-k8s/tasks/main.yml |  8 ++++
 .../roles/vhosts/gpu-k8s/tasks/run_test.yml   | 13 ++++++
 7 files changed, 111 insertions(+)
 create mode 100644 docs/gpu-k8s-role.md
 create mode 100644 playbooks/demo_gpu_k8s.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
 create mode 100644 playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml

diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md
new file mode 100644
index 0000000..e8d4eb6
--- /dev/null
+++ b/docs/gpu-k8s-role.md
@@ -0,0 +1,44 @@
+# GPU Kubernetes Role
+
+This document describes how to use the `gpu-k8s` role to deploy a simple Kubernetes cluster with NVIDIA GPU support.
+
+## Overview
+
+The role performs three main tasks:
+
+1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes.
+2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
+3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
+
+The following command is used to create the cluster:
+
+```bash
+sealos run \
+  registry.cn-shanghai.aliyuncs.com/labring/kubernetes:v1.29.9 \
+  registry.cn-shanghai.aliyuncs.com/labring/cilium:v1.13.4 \
+  registry.cn-shanghai.aliyuncs.com/labring/helm:v3.9.4 \
+  --masters 172.16.11.120 \
+  --nodes 172.16.11.152 \
+  --env '{}' \
+  --cmd "kubeadm init --skip-phases=addon/kube-proxy"
+```
+
+After the cluster is running, the role installs the NVIDIA device plugin and runs a test pod to ensure `nvidia-smi` works inside the cluster.
+
+## Usage
+
+Add the role to your playbook:
+
+```yaml
+- hosts: all
+  roles:
+    - gpu-k8s
+```
+
+Run the playbook with an inventory that lists the master and worker nodes by IP address; the role selects the master by comparing `inventory_hostname` with `master_ip`.
+
+```bash
+ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
+```
+
+The final step prints the output of `nvidia-smi` from inside a Kubernetes pod, confirming that the GPU is available.
diff --git a/playbooks/demo_gpu_k8s.yml b/playbooks/demo_gpu_k8s.yml
new file mode 100644
index 0000000..270765a
--- /dev/null
+++ b/playbooks/demo_gpu_k8s.yml
@@ -0,0 +1,4 @@
+- hosts: all  # the role itself limits the cluster bootstrap and GPU test to the host matching master_ip
+  become: true  # the role installs apt packages and adds apt sources, which needs root
+  roles:
+    - gpu-k8s
diff --git a/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
new file mode 100644
index 0000000..92c84eb
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/defaults/main.yml
@@ -0,0 +1,8 @@
+# Default variables for gpu-k8s role
+sealos_version: v1.29.9  # tag of the labring/kubernetes image passed to `sealos run`
+cilium_version: v1.13.4  # tag of the labring/cilium image passed to `sealos run`
+helm_version: v3.9.4  # tag of the labring/helm image passed to `sealos run`
+master_ip: "172.16.11.120"  # value of --masters; also compared with inventory_hostname to pick the host that runs sealos and kubectl
+node_ip: "172.16.11.152"  # value of --nodes
+sealos_cmd_env: '{}'  # value of --env, wrapped in single quotes by the task
+kubeadm_init_cmd: "kubeadm init --skip-phases=addon/kube-proxy"  # value of --cmd, wrapped in double quotes by the task
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
new file mode 100644
index 0000000..caa7eb9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_cluster.yml
@@ -0,0 +1,18 @@
+# Bootstraps the cluster by running sealos on the host that matches master_ip.
+- name: Run sealos to create Kubernetes cluster
+  shell: |
+    sealos run \
+      registry.cn-shanghai.aliyuncs.com/labring/kubernetes:{{ sealos_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/cilium:{{ cilium_version }} \
+      registry.cn-shanghai.aliyuncs.com/labring/helm:{{ helm_version }} \
+      --masters {{ master_ip }} \
+      --nodes {{ node_ip }} \
+      --env '{{ sealos_cmd_env }}' \
+      --cmd "{{ kubeadm_init_cmd }}"
+  args:
+    executable: /bin/bash
+    # Skip the bootstrap once kubeadm has written its admin kubeconfig, so
+    # re-running the playbook does not attempt a second `kubeadm init`.
+    creates: /etc/kubernetes/admin.conf
+  # Accept inventories that name the master by IP or by an alias with ansible_host set to the IP.
+  when: inventory_hostname == master_ip or (ansible_host | default('')) == master_ip
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
new file mode 100644
index 0000000..d18656d
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml
@@ -0,0 +1,29 @@
+# Registers the graphics-drivers PPA and the NVIDIA container runtime apt source.
+# NOTE(review): apt-key is deprecated on recent Debian/Ubuntu releases; consider
+# a signed-by keyring when this role is next revised.
+- name: Add NVIDIA repository
+  shell: |
+    # Stop at the first failing command, including one on the left side of a pipe.
+    set -eo pipefail
+    add-apt-repository -y ppa:graphics-drivers
+    # -f makes curl exit non-zero on an HTTP error (for example an unsupported
+    # distribution) instead of passing the error page on as a key or apt source.
+    curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
+    distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
+    curl -fsSL "https://nvidia.github.io/nvidia-container-runtime/${distribution}/nvidia-container-runtime.list" \
+      -o /etc/apt/sources.list.d/nvidia-container-runtime.list
+    apt-get update
+  args:
+    executable: /bin/bash
+  become: true
+
+- name: Install NVIDIA driver and container runtime
+  apt:
+    name:
+      - nvidia-modprobe
+      - nvidia-driver-535
+      - nvidia-headless-535
+      - nvidia-container-runtime
+    state: present
+    update_cache: true
+  become: true
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
new file mode 100644
index 0000000..63cc9c9
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/main.yml
@@ -0,0 +1,8 @@
+- name: Bootstrap Kubernetes cluster with GPU support
+  include_tasks: install_cluster.yml  # runs `sealos run` on the host matching master_ip
+
+- name: Install NVIDIA GPU drivers
+  include_tasks: install_driver.yml  # adds the NVIDIA apt sources, then installs the driver and container runtime packages
+
+- name: Validate GPU access with test workload
+  include_tasks: run_test.yml  # applies the NVIDIA device plugin manifest and runs nvidia-smi in a pod
diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
new file mode 100644
index 0000000..c0fd468
--- /dev/null
+++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml
@@ -0,0 +1,33 @@
+- name: Deploy NVIDIA device plugin
+  shell: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.5/nvidia-device-plugin.yml
+  args:
+    executable: /bin/bash
+  # Accept inventories that name the master by IP or by an alias with ansible_host set to the IP.
+  when: inventory_hostname == master_ip or (ansible_host | default('')) == master_ip
+
+# Runs nvidia-smi in a one-off pod and captures its output before cleaning up.
+# NOTE(review): the pod requests no nvidia.com/gpu resource, so nvidia-smi is
+# presumably only present when the node's default container runtime injects the
+# driver - confirm against the target cluster.
+- name: Run CUDA validation pod
+  shell: |
+    set -eo pipefail
+    # Remove a pod left behind by an earlier failed run so `kubectl run` does not hit AlreadyExists.
+    kubectl delete pod gpu-test --ignore-not-found --wait
+    kubectl run gpu-test --image=nvidia/cuda:12.3.2-base-ubuntu22.04 --restart=Never -- nvidia-smi
+    # Let the pod finish before reading its logs; keep the wait status so the
+    # logs and cleanup below still run when the pod fails or times out.
+    rc=0
+    kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-test --timeout=300s || rc=$?
+    kubectl logs gpu-test || true
+    kubectl delete pod gpu-test --wait
+    exit "$rc"
+  args:
+    executable: /bin/bash
+  register: gpu_test_result
+  when: inventory_hostname == master_ip or (ansible_host | default('')) == master_ip
+
+- name: Show nvidia-smi output from the validation pod
+  debug:
+    var: gpu_test_result.stdout_lines
+  when: inventory_hostname == master_ip or (ansible_host | default('')) == master_ip