diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md index 45c9eea..37b65e1 100644 --- a/docs/gpu-k8s-role.md +++ b/docs/gpu-k8s-role.md @@ -10,6 +10,7 @@ The role performs three main tasks: 2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources. 3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload. + The following command is used to create the cluster (example with one master and one worker): ```bash @@ -35,6 +36,7 @@ Add the role to your playbook: - gpu-k8s ``` + Example playbook snippet defining the IP lists: ```yaml @@ -51,7 +53,9 @@ Example playbook snippet defining the IP lists: The playbook expects `master_ips` and `node_ips` variables which are lists of IP addresses. Up to three masters can be specified. -Run the playbook with your inventory that contains these IP addresses. + +Run the playbook with your inventory that contains the master and node IP addresses. + ```bash ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml index 9ff04d2..8cf35e7 100644 --- a/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml +++ b/playbooks/roles/vhosts/gpu-k8s/tasks/run_test.yml @@ -11,3 +11,4 @@ args: executable: /bin/bash when: inventory_hostname == (master_ips | first) + diff --git a/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh b/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh index de5c433..6977ac9 100644 --- a/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh +++ b/scripts/k3s-cluster/setup-k3s-cluster-with-br0.sh @@ -7,6 +7,21 @@ curl -sfL https://get.k3s.io | sh - export INSTALL_K3S_EXEC="server --data-dir=/mnt/opt/rancher/k3s --disable=traefik,servicelb,local-storage --kube-apiserver-arg=service-node-port-range=0-50000 --system-default-registry=registry.cn-hangzhou.aliyuncs.com --flannel-iface=br0" curl -sfL 
https://rancher-mirror.rancher.cn/k3s/k3s-install.sh | sh - + +# Install IPVS +sudo apt-get -y install ipset ipvsadm + +# Install K3s with Calico and kube-proxy in IPVS mode +curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--kube-apiserver-arg=service-node-port-range=0-50000 --flannel-iface=br0 --disable traefik,metrics-server,servicelb --disable-cloud-controller --kubelet-arg cloud-provider=external --flannel-backend=none --disable-network-policy" K3S_KUBECONFIG_MODE="644" sh -s - server --kube-proxy-arg proxy-mode=ipvs + +# Install Calico (kubectl reads the KUBECONFIG environment variable by itself; passing its value as a bare argument would be parsed as the sub-command) +kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/tigera-operator.yaml +kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml + +# Remove taints in k3s if any (usually happens if started without cloud-manager) +sudo kubectl taint nodes --all node.cloudprovider.kubernetes.io/uninitialized=false:NoSchedule- + + # === 设置本地 kubeconfig === mkdir -p ~/.kube cp /etc/rancher/k3s/k3s.yaml ~/.kube/config @@ -21,3 +36,7 @@ until kubectl get pods -A 2>/dev/null | grep -q "coredns.*Running"; do sleep 3 done echo "✅ K3s 安装完成,kubectl/helm 已就绪" + + + +