Merge branch 'main' into te0hq5-codex/使用ansible安装k8s并配置gpu驱动
This commit is contained in:
commit
d29198d1cd
@ -10,6 +10,7 @@ The role performs three main tasks:
|
||||
2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources.
|
||||
3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload.
|
||||
|
||||
|
||||
The following command is used to create the cluster (example with one master and one worker):
|
||||
|
||||
```bash
|
||||
@ -35,6 +36,7 @@ Add the role to your playbook:
|
||||
- gpu-k8s
|
||||
```
|
||||
|
||||
|
||||
Example playbook snippet defining the IP lists:
|
||||
|
||||
```yaml
|
||||
@ -51,7 +53,9 @@ Example playbook snippet defining the IP lists:
|
||||
The playbook expects `master_ips` and `node_ips` variables which are lists of IP addresses. Up to
|
||||
three masters can be specified.
|
||||
|
||||
Run the playbook with your inventory that contains these IP addresses.
|
||||
|
||||
Run the playbook with your inventory that contains the master and node IP addresses.
|
||||
|
||||
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts/all playbooks/demo_gpu_k8s.yml
|
||||
|
||||
@ -11,3 +11,4 @@
|
||||
args:
|
||||
executable: /bin/bash
|
||||
when: inventory_hostname == (master_ips | first)
|
||||
|
||||
|
||||
@ -7,6 +7,21 @@ curl -sfL https://get.k3s.io | sh -
|
||||
export INSTALL_K3S_EXEC="server --data-dir=/mnt/opt/rancher/k3s --disable=traefik,servicelb,local-storage --kube-apiserver-arg=service-node-port-range=0-50000 --system-default-registry=registry.cn-hangzhou.aliyuncs.com --flannel-iface=br0"
|
||||
curl -sfL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh | sh -
|
||||
|
||||
|
||||
# Install the IPVS userspace tools (ipset, ipvsadm) for kube-proxy's IPVS mode
|
||||
sudo apt-get -y install ipset ipvsadm
|
||||
|
||||
# Install K3s with flannel and the built-in network policy controller disabled (Calico is installed below) and kube-proxy in IPVS mode
|
||||
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--kube-apiserver-arg=service-node-port-range=0-50000 --flannel-iface=br0 --disable traefik,metrics-server,servicelb --disable-cloud-controller --kubelet-arg cloud-provider=external --flannel-backend=none --disable-network-policy" K3S_KUBECONFIG_MODE="644" sh -s - server --kube-proxy-arg proxy-mode=ipvs
|
||||
|
||||
# Install Calico
|
||||
# NOTE(review): $KUBECONFIG is expanded unquoted as a positional argument here
# and in the custom-resources.yaml command that follows. That only works when
# it is unset/empty or already holds a complete flag such as
# "--kubeconfig <path>"; if it holds a bare path, kubectl would receive the
# path where it expects a subcommand. Its definition is not visible in this
# part of the script — confirm how it is set before relying on these commands.
kubectl $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/tigera-operator.yaml
|
||||
kubectl $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/custom-resources.yaml
|
||||
|
||||
# Remove the cloud-provider "uninitialized" taint from all nodes, if present (it usually appears when k3s is started without a cloud controller manager)
|
||||
sudo kubectl taint nodes --all node.cloudprovider.kubernetes.io/uninitialized=false:NoSchedule-
|
||||
|
||||
|
||||
# === Set up the local kubeconfig ===
|
||||
mkdir -p ~/.kube
|
||||
cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
|
||||
@ -21,3 +36,7 @@ until kubectl get pods -A 2>/dev/null | grep -q "coredns.*Running"; do
|
||||
sleep 3
|
||||
done
|
||||
echo "✅ K3s 安装完成,kubectl/helm 已就绪"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user