Merge branch 'main' into te0hq5-codex/使用ansible安装k8s并配置gpu驱动

This commit is contained in:
shenlan 2025-06-24 11:30:37 +08:00 committed by GitHub
commit 4e4541177c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 25 additions and 1 deletions

View File

@ -10,6 +10,7 @@ The role performs three main tasks:
2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
The following command is used to create the cluster (example with one master and one worker):
```bash
@ -35,6 +36,7 @@ Add the role to your playbook:
- gpu-k8s
```
Example playbook snippet defining the IP lists:
```yaml
@ -51,7 +53,9 @@ Example playbook snippet defining the IP lists:
The playbook expects `master_ips` and `node_ips` variables which are lists of IP addresses. Up to
three masters can be specified.
Run the playbook with your inventory that contains these IP addresses.
Run the playbook with your inventory that contains the master and node IP addresses.
```bash
ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml

View File

@ -11,3 +11,4 @@
args:
executable: /bin/bash
when: inventory_hostname == (master_ips | first)

View File

@ -7,6 +7,21 @@ curl -sfL https://get.k3s.io | sh -
# Server flags for the K3s installer: custom data dir, bundled traefik /
# servicelb / local-storage disabled, NodePort range widened to 0-50000,
# system images pulled from the Aliyun registry, flannel bound to br0.
export INSTALL_K3S_EXEC="server --data-dir=/mnt/opt/rancher/k3s --disable=traefik,servicelb,local-storage --kube-apiserver-arg=service-node-port-range=0-50000 --system-default-registry=registry.cn-hangzhou.aliyuncs.com --flannel-iface=br0"
# Fetch the install script from the rancher.cn mirror and run it; the exported
# INSTALL_K3S_EXEC above is in the environment of this `sh`.
curl -sfL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh | sh -
# Install IPVS
# (ipset and ipvsadm userspace tools, installed via apt-get)
sudo apt-get -y install ipset ipvsadm
# Install K3s with Calico and kube-proxy in IPVS mode
# The inline INSTALL_K3S_EXEC / K3S_KUBECONFIG_MODE assignments apply only to
# this `sh` invocation and take precedence over the exported value above.
# Flannel is turned off (--flannel-backend=none, --disable-network-policy) so
# that Calico can be installed as the CNI in the next step.
# NOTE(review): this is a second installer run after the mirror-based one
# above, with a different flag set (no --data-dir, no --system-default-registry)
# — confirm that both runs are intended.
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--kube-apiserver-arg=service-node-port-range=0-50000 --flannel-iface=br0 --disable traefik,metrics-server,servicelb --disable-cloud-controller --kubelet-arg cloud-provider=external --flannel-backend=none --disable-network-policy" K3S_KUBECONFIG_MODE="644" sh -s - server --kube-proxy-arg proxy-mode=ipvs
# Install Calico
# Creates the Tigera operator first, then the custom resources, both pinned to
# Calico v3.26.0.
# NOTE(review): $KUBECONFIG is expanded unquoted *before* the subcommand. This
# only works if it is empty/unset or holds flags (e.g. "--kubeconfig <path>");
# if it holds a plain file path, kubectl will treat that path as the command
# and fail — verify how KUBECONFIG is set for this script.
kubectl $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/tigera-operator.yaml
kubectl $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml
# Remove taints in k3s if any (usually happens if started without cloud-manager)
# The trailing "-" on the taint spec asks kubectl to remove the taint rather
# than add it; --all applies this to every node.
sudo kubectl taint nodes --all node.cloudprovider.kubernetes.io/uninitialized=false:NoSchedule-
# === Set up local kubeconfig ===
# Copy the kubeconfig generated by K3s into the current user's ~/.kube/config.
mkdir -p ~/.kube
cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
@ -21,3 +36,7 @@ until kubectl get pods -A 2>/dev/null | grep -q "coredns.*Running"; do
sleep 3
done
echo "✅ K3s 安装完成kubectl/helm 已就绪"