#!/bin/bash
#
# GPU Kubernetes bootstrap script: installs containerd/nerdctl, the NVIDIA
# driver and container toolkit, brings up a master node with kubeadm or sealos,
# deploys the NVIDIA device plugin and runs a CUDA smoke test.
# Run without arguments to print the list of stages.
set -euo pipefail

# === Global settings ===
# First address reported by `hostname -I`. The lookup is best effort here so
# that stages which never use the address still run on hosts where it fails;
# deploy_k8s recomputes the value before using it.
MASTER_IP=$(hostname -I 2>/dev/null | awk '{print $1}') || MASTER_IP=""
USER=${USER:-$(whoami)}
SSH_KEY="${HOME}/.ssh/id_rsa"
K8S_VERSION="labring/kubernetes:v1.29.9"
CILIUM_VERSION="labring/cilium:v1.13.4"
HELM_VERSION="labring/helm:v3.9.4"
NVIDIA_DRIVER_VERSION="nvidia-driver-535"
NVIDIA_PLUGIN_VERSION="v0.17.1"
NERDCTL_VERSION="2.1.2"
PROXY_ADDR="http://127.0.0.1:1081"
USE_PROXY=${USE_PROXY:-false}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Directory holding offline artifacts; defaults to the script's own directory.
OFFLINE_DIR=${OFFLINE_DIR:-$SCRIPT_DIR}

# Deployment mode: kubeadm or sealos.
# Defaults to kubeadm; override with the DEPLOY_MODE environment variable.
DEPLOY_MODE=${DEPLOY_MODE:-kubeadm}

# Tool used to load the offline image archive: sealos, nerdctl or docker.
# Defaults to sealos; override with the IMAGE_LOAD_TOOL environment variable.
IMAGE_LOAD_TOOL=${IMAGE_LOAD_TOOL:-sealos}

# === Optional proxy ===
#######################################
# Export or clear the HTTP(S) proxy environment variables.
# Globals:   USE_PROXY (read), PROXY_ADDR (read),
#            http_proxy/https_proxy/HTTP_PROXY/HTTPS_PROXY (written)
# Outputs:   one status line on stdout
#######################################
configure_proxy() {
  if [ "$USE_PROXY" = true ]; then
    export http_proxy="$PROXY_ADDR" https_proxy="$PROXY_ADDR"
    export HTTP_PROXY="$PROXY_ADDR" HTTPS_PROXY="$PROXY_ADDR"
    echo "🌐 代理已启用: $PROXY_ADDR"
  else
    unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
    echo "🌐 代理已关闭"
  fi
}

#######################################
# Run curl, adding --proxy when proxying is enabled.
# Globals:   USE_PROXY (read), PROXY_ADDR (read)
# Arguments: passed through to curl unchanged
# Returns:   curl's exit status
#######################################
proxy_curl() {
  if [ "$USE_PROXY" = true ]; then
    curl --proxy "$PROXY_ADDR" "$@"
  else
    curl "$@"
  fi
}

#######################################
# Load ${OFFLINE_DIR}/images/gpu_k8s_images.tar with the selected tool.
# Globals:   OFFLINE_DIR (read), IMAGE_LOAD_TOOL (read)
# Returns:   0 when the archive is absent (nothing to do) or was loaded;
#            the loader's status when it runs; 1 when the tool is unsupported
#            or not installed.
#######################################
load_offline_images() {
  local image_tar="${OFFLINE_DIR}/images/gpu_k8s_images.tar"
  [ -f "$image_tar" ] || return 0

  echo "📦 导入离线镜像..."
  case "$IMAGE_LOAD_TOOL" in
    sealos)
      if command -v sealos &>/dev/null; then
        sealos load -i "$image_tar"
        return
      fi
      ;;
    nerdctl)
      if command -v nerdctl &>/dev/null; then
        sudo nerdctl load -i "$image_tar"
        return
      fi
      ;;
    docker)
      if command -v docker &>/dev/null; then
        docker load -i "$image_tar"
        return
      fi
      ;;
    *)
      echo "❌ 不支持的加载工具: $IMAGE_LOAD_TOOL" >&2
      return 1
      ;;
  esac

  # Reached only when the selected tool is supported but not on PATH.
  echo "❌ 无法找到 $IMAGE_LOAD_TOOL 用于加载镜像" >&2
  return 1
}

#######################################
# Install every .deb found in ${OFFLINE_DIR}/packages.
# Globals:   OFFLINE_DIR (read)
# Returns:   0 after an install attempt (dpkg, or apt-get -f as repair);
#            1 when the directory is missing or holds no .deb file, so that
#            callers fall back to their online install path.
#######################################
install_all_offline_packages() {
  local pkg_dir="${OFFLINE_DIR}/packages"
  [ -d "$pkg_dir" ] || return 1

  # Collect the packages explicitly: an unmatched glob would otherwise be
  # handed to dpkg as a literal "*.deb", and paths with spaces would split.
  local -a debs=()
  local deb
  for deb in "$pkg_dir"/*.deb; do
    if [ -e "$deb" ]; then
      debs+=("$deb")
    fi
  done
  if [ "${#debs[@]}" -eq 0 ]; then
    return 1
  fi

  echo "📦 Using offline deb packages"
  # dpkg's stderr is discarded as in the original; presumably its dependency
  # complaints are expected and repaired by `apt-get -f install` — confirm.
  sudo dpkg -i "${debs[@]}" 2>/dev/null || sudo apt-get -f install -y
  return 0
}

#######################################
# Stage 1: install base dependencies, preferring the offline .deb bundle.
# Globals:   NVIDIA_DRIVER_VERSION (read)
# Returns:   0 on success; apt-get's status on failure.
#######################################
install_base() {
  echo "[1/8] 安装基础依赖"
  if install_all_offline_packages; then
    return 0
  fi

  # Online fallback.
  # NOTE(review): nvidia-container-toolkit is requested here, but the NVIDIA
  # apt source is only added later by install_nvidia — confirm the package is
  # resolvable from the host's existing sources at this point.
  sudo apt-get update -y
  sudo apt-get install -y curl gnupg2 ca-certificates lsb-release \
    apt-transport-https software-properties-common openssh-client \
    openssh-server uidmap containerd "${NVIDIA_DRIVER_VERSION}" nvidia-container-toolkit
}

#######################################
# Stage 2: install containerd and unpack the nerdctl-full bundle to /usr/local.
# Globals:   OFFLINE_DIR (read), NERDCTL_VERSION (read)
# Outputs:   progress lines and the nerdctl version check result on stdout
# Notes:     overwrites /etc/containerd/config.toml with containerd's defaults.
#######################################
install_containerd() {
  echo "[2/8] 安装 containerd + nerdctl"
  # Best effort: the packages being purged may not be installed at all.
  sudo apt-get purge -y docker.io docker-ce docker-ce-cli containerd.io || true
  if ! install_all_offline_packages; then
    sudo apt-get install -y containerd
  fi

  local archive="nerdctl-full-${NERDCTL_VERSION}-linux-amd64.tar.gz"
  local archive_dir="$OFFLINE_DIR"
  local download_dir=""
  local url
  if [ ! -f "${OFFLINE_DIR}/${archive}" ]; then
    # No offline copy: download into a scratch directory removed below.
    download_dir=$(mktemp -d)
    archive_dir="$download_dir"
    url="https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/${archive}"
    echo "🔽 下载 nerdctl: $url"
    proxy_curl -fLo "${archive_dir}/${archive}" "$url"
  fi

  echo "📅 解压 nerdctl 到 /usr/local"
  sudo tar -xzf "${archive_dir}/${archive}" -C /usr/local
  if [ -n "$download_dir" ]; then
    rm -rf -- "$download_dir"
  fi

  sudo mkdir -p /etc/containerd
  sudo containerd config default | sudo tee /etc/containerd/config.toml >/dev/null
  sudo systemctl enable --now containerd
  if nerdctl --version; then
    echo "✅ nerdctl 安装成功"
  else
    echo "❌ nerdctl 安装失败"
  fi
}

#######################################
# Stage 3: install the NVIDIA driver and container toolkit, register the
# NVIDIA runtime as containerd's default, and verify with nvidia-smi.
# Globals:   OFFLINE_DIR, NVIDIA_DRIVER_VERSION, DEPLOY_MODE (all read)
# Returns:   0 on success; exits 1 when nvidia-smi is missing or fails.
#######################################
install_nvidia() {
  echo "[3/8] 安装 NVIDIA 驱动和容器工具"
  local distribution="ubuntu22.04"
  local keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local repo_base="https://nvidia.github.io/nvidia-docker"

  if [ -f "${OFFLINE_DIR}/nvidia-gpgkey" ]; then
    # NOTE(review): the offline key is installed as-is under a .gpg name;
    # presumably it is already de-armored — confirm.
    sudo install -m 0644 "${OFFLINE_DIR}/nvidia-gpgkey" "$keyring"
  else
    # --yes lets a re-run overwrite the keyring left by a previous run.
    proxy_curl -sL "${repo_base}/gpgkey" \
      | sudo gpg --dearmor --yes -o "$keyring"
  fi

  if ! install_all_offline_packages; then
    proxy_curl -sL "${repo_base}/${distribution}/nvidia-docker.list" \
      | sed "s|^deb |deb [signed-by=${keyring}] |" \
      | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
    sudo apt-get update -y
    sudo apt-get install -y "${NVIDIA_DRIVER_VERSION}" nvidia-container-toolkit
  fi

  if [ "$DEPLOY_MODE" = "sealos" ]; then
    # --runtime=containerd is required: nvidia-ctk targets docker by default,
    # while the --config path here is sealos' containerd config.toml.
    sudo nvidia-ctk runtime configure --runtime=containerd \
      --config /var/lib/sealos/data/default/rootfs/etc/containerd/config.toml \
      --set-as-default
    sudo systemctl restart sealos-containerd
  else
    sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
    sudo systemctl restart containerd
  fi

  if ! command -v nvidia-smi >/dev/null; then
    echo "❌ nvidia-smi 未找到" >&2
    exit 1
  fi
  if ! nvidia-smi; then
    echo "❌ NVIDIA 驱动有问题" >&2
    exit 1
  fi
}

#######################################
# Stage 4: install sealos unless it is already on PATH.
# Globals:   OFFLINE_DIR (read); SEALOS_VERSION (read, optional, default
#            5.0.1) selects which offline .deb file name is looked up.
# Returns:   0 when sealos is already present or was installed.
#######################################
install_sealos() {
  echo "[4/8] 安装 Sealos"
  if command -v sealos &>/dev/null; then
    return
  fi

  local deb="${OFFLINE_DIR}/sealos_${SEALOS_VERSION:-5.0.1}_linux_amd64.deb"
  if [ -f "$deb" ]; then
    sudo dpkg -i "$deb"
  else
    # NOTE(review): this pipes a script fetched from the `main` branch straight
    # into bash; consider pinning a release and verifying it before running.
    proxy_curl -sfL https://raw.githubusercontent.com/labring/sealos/main/scripts/install.sh | bash
  fi
}

#######################################
# Install kubeadm, kubelet and kubectl at a pinned version and hold them.
# Globals:   OFFLINE_DIR (read)
# Arguments: $1 - Kubernetes version as MAJOR.MINOR.PATCH (default 1.29.9)
# Notes:     Offline .deb files are used only when packages for the requested
#            version are present; otherwise the pkgs.k8s.io repository for the
#            matching minor release is configured and used.
#######################################
install_kubeadm() {
  echo "📦 安装 kubeadm、kubelet、kubectl"

  # Default version; pass an argument to override.
  local kube_version="${1:-1.29.9}"
  # pkgs.k8s.io publishes one repository per minor release (e.g. v1.29).
  local kube_minor="${kube_version%.*}"
  local keyring="/etc/apt/keyrings/kubernetes-archive-keyring.gpg"
  local repo_url="https://pkgs.k8s.io/core:/stable:/v${kube_minor}/deb/"
  local pkg_dir="${OFFLINE_DIR}/packages"

  echo "➡️ 目标版本: $kube_version"

  # Match any package revision suffix (-00, -1.1, ...) for the requested version.
  local -a debs=()
  local deb
  if [ -d "$pkg_dir" ]; then
    for deb in "$pkg_dir"/kube{adm,let,ctl}_*"${kube_version}"-*.deb; do
      if [ -e "$deb" ]; then
        debs+=("$deb")
      fi
    done
  fi

  if [ "${#debs[@]}" -gt 0 ]; then
    echo "📦 使用离线 deb 安装 kubeadm/kubelet/kubectl"
    # dpkg's stderr is discarded as in the original; apt-get -f repairs
    # unmet dependencies left behind by dpkg.
    sudo dpkg -i "${debs[@]}" 2>/dev/null || sudo apt-get -f install -y
    sudo apt-mark hold kubelet kubeadm kubectl
    sudo systemctl enable --now kubelet
    echo "✅ kubeadm/kubelet/kubectl 离线安装完成(版本 ${kube_version})"
    return
  fi

  # Online install.
  sudo apt-get update
  sudo apt-get install -y apt-transport-https ca-certificates curl gpg

  sudo mkdir -p /etc/apt/keyrings
  if [ -f "${OFFLINE_DIR}/kubernetes-archive-keyring.gpg" ]; then
    sudo cp "${OFFLINE_DIR}/kubernetes-archive-keyring.gpg" /etc/apt/keyrings/
  else
    # --yes lets a re-run overwrite an existing keyring.
    proxy_curl -fsSL "${repo_url}Release.key" \
      | sudo gpg --dearmor --yes -o "$keyring"
  fi

  echo "deb [signed-by=${keyring}] ${repo_url} /" \
    | sudo tee /etc/apt/sources.list.d/kubernetes.list

  sudo apt-get update

  # Pin the version; the trailing -* matches the repository's package
  # revision, which is not "-00" on pkgs.k8s.io.
  sudo apt-get install -y \
    "kubelet=${kube_version}-*" \
    "kubeadm=${kube_version}-*" \
    "kubectl=${kube_version}-*"

  sudo apt-mark hold kubelet kubeadm kubectl

  echo "✅ kubeadm/kubelet/kubectl 安装完成(版本 ${kube_version})"

  # Enable kubelet now; it only becomes fully active after kubeadm init.
  sudo systemctl enable --now kubelet
}

#######################################
# Stage 5: enable password-less SSH to this host for the current user.
# Globals:   SSH_KEY (read), HOME (read)
# Notes:     Safe to re-run: the public key is appended to authorized_keys
#            only when it is not already listed there.
#######################################
setup_ssh() {
  echo "[5/8] 配置 SSH 免密"
  local ssh_dir="${HOME}/.ssh"
  local auth_file="${ssh_dir}/authorized_keys"
  local pub_key

  mkdir -p "$ssh_dir"
  if [ ! -f "${SSH_KEY}" ]; then
    ssh-keygen -f "${SSH_KEY}" -N ""
  fi

  pub_key=$(<"${SSH_KEY}.pub")
  # grep's stderr is silenced because authorized_keys may not exist yet.
  if ! grep -qxF -- "$pub_key" "$auth_file" 2>/dev/null; then
    printf '%s\n' "$pub_key" >> "$auth_file"
  fi
  chmod 600 "$auth_file"
  chmod 700 "$ssh_dir"

  # The service is named "ssh" on some distributions and "sshd" on others.
  sudo systemctl enable --now ssh || sudo systemctl enable --now sshd
}

# === deploy_k8s supports both the sealos and the kubeadm mode ===
#######################################
# Stage 6: bring up the Kubernetes master on this host.
# Globals:   DEPLOY_MODE, K8S_VERSION, CILIUM_VERSION, HELM_VERSION, USER,
#            SSH_KEY, HOME (read); MASTER_IP (written)
# Returns:   0 on success; 1 when no host IP can be determined or, in
#            kubeadm mode, when helm is not installed.
#######################################
deploy_k8s() {
  echo "[6/8] 部署 Kubernetes,模式: $DEPLOY_MODE"
  MASTER_IP=$(hostname -I | awk '{print $1}')
  if [ -z "$MASTER_IP" ]; then
    echo "❌ could not determine the host IP from 'hostname -I'" >&2
    return 1
  fi

  if [ "$DEPLOY_MODE" = "sealos" ]; then
    echo "[6.0] 使用 Sealos 部署 Kubernetes"
    # Best effort: the offline image archive is optional.
    load_offline_images || true
    sealos run "$K8S_VERSION" "$CILIUM_VERSION" "$HELM_VERSION" \
      --masters "$MASTER_IP" --user "$USER" --pk "$SSH_KEY" \
      --env '{}' --cmd "kubeadm init --skip-phases=addon/kube-proxy"
  else
    # Fail before kubeadm init changes the host if the CNI cannot be installed.
    if ! command -v helm &>/dev/null; then
      echo "❌ helm is required in kubeadm mode but was not found" >&2
      return 1
    fi

    echo "[6.0] 使用 kubeadm 初始化 K8s master 节点"
    sudo kubeadm init \
      --pod-network-cidr=10.42.0.0/16 \
      --apiserver-advertise-address="$MASTER_IP" \
      --skip-phases=addon/kube-proxy

    local kube_dir="${HOME}/.kube"
    mkdir -p "$kube_dir"
    sudo cp -f /etc/kubernetes/admin.conf "${kube_dir}/config"
    sudo chown "$(id -u):$(id -g)" "${kube_dir}/config"

    # CILIUM_VERSION holds a sealos image reference ("labring/cilium:v1.13.4");
    # helm needs the bare chart version ("1.13.4").
    local cilium_chart_version="${CILIUM_VERSION:-1.13.4}"
    cilium_chart_version="${cilium_chart_version##*:}"
    cilium_chart_version="${cilium_chart_version#v}"

    echo "[6.1] 安装 Cilium 网络插件(无 kube-proxy 模式)"
    helm repo add cilium https://helm.cilium.io/ || true
    helm repo update
    helm install cilium cilium/cilium --version "$cilium_chart_version" \
      --namespace kube-system \
      --set kubeProxyReplacement=true \
      --set k8sServiceHost="$MASTER_IP" \
      --set k8sServicePort=6443
  fi

  echo "[6.2] Kubernetes 部署完成 ✅"
}

#######################################
# Stage 7: apply the NVIDIA device plugin manifest and print its status.
# Globals:   OFFLINE_DIR, NVIDIA_PLUGIN_VERSION, USE_PROXY, PROXY_ADDR (read)
# Notes:     The status checks at the end are diagnostics only and never fail
#            the stage; the plugin may simply not be ready after the wait.
#######################################
deploy_plugin() {
  echo "[7/8] 部署 NVIDIA Device Plugin"
  local plugin_file="${OFFLINE_DIR}/nvidia-device-plugin.yml"
  local plugin_url

  if [ -f "$plugin_file" ]; then
    kubectl apply -f "$plugin_file"
  else
    plugin_url="https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml"
    if [ "$USE_PROXY" = true ]; then
      HTTPS_PROXY=$PROXY_ADDR HTTP_PROXY=$PROXY_ADDR \
        kubectl apply -f "$plugin_url"
    else
      kubectl apply -f "$plugin_url"
    fi
  fi

  sleep 15
  kubectl -n kube-system get pods | grep nvidia || echo "⚠️ 插件未启动"
  # Without the fallback, a node that does not list the GPU resource yet would
  # make this pipeline fail and abort the whole script under set -e/pipefail.
  kubectl describe node | grep -A10 Capacity | grep -i nvidia \
    || echo "⚠️ node does not report nvidia.com/gpu capacity yet"
}

#######################################
# Stage 8: run the CUDA vectoradd sample in a pod requesting one GPU and
# print its logs.
# Notes:     Waiting and log retrieval are best effort and never fail the stage.
#######################################
run_test() {
  echo "[8/8] 运行 CUDA vectoradd GPU 测试"
  # A finished pod from an earlier run would make `apply` a no-op, so remove it.
  kubectl delete pod gpu-pod --ignore-not-found
  kubectl apply -f - <<'YAML'
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
spec:
  restartPolicy: Never
  containers:
    - name: cuda-test
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
      resources:
        limits:
          nvidia.com/gpu: 1
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
YAML
  # "Succeeded" is a pod phase, not a condition, so wait on .status.phase.
  kubectl wait pod/gpu-pod --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s || true
  kubectl logs gpu-pod || echo "⚠️ 未获取日志"
}

#######################################
# Print usage: available stage flags, environment variables and examples.
# Outputs:   help text on stdout
#######################################
show_help() {
  printf '%s\n\n' "用法: ./gpu-k8s.sh [阶段参数...]"
  printf '%s\n' \
    "可用阶段:" \
    " --install-base 安装基础依赖" \
    " --install-containerd 安装 containerd + nerdctl" \
    " --install-nvidia 安装 NVIDIA 驱动和工具" \
    " --install-sealos 安装 Sealos" \
    " --install-kubeadm 安装 kubeadm/kubelet/kubectl" \
    " --setup-ssh 配置 SSH 免密" \
    " --load_offline_images 导入离线镜像" \
    " --deploy-k8s 部署 Kubernetes(支持 sealos/kubeadm)" \
    " --deploy-plugin 部署 NVIDIA Device Plugin" \
    " --run-test 运行 GPU 测试" \
    " --all 全部步骤执行" \
    "" \
    "环境变量:" \
    " OFFLINE_DIR 指定离线包解压目录,默认为脚本所在目录" \
    " DEPLOY_MODE 设置部署模式(kubeadm 或 sealos,默认 kubeadm)" \
    " IMAGE_LOAD_TOOL 选择加载镜像的工具(sealos|nerdctl|docker,默认 sealos)"
  # Header row for the examples: two columns separated by three tabs.
  printf '\n%s\t\t\t%s\n' "示例命令" "说明"
  printf '%s\n' \
    "USE_PROXY=true ./gpu-k8s.sh --install-nvidia # 只安装 NVIDIA 工具包并走代理" \
    "DEPLOY_MODE=sealos ./gpu-k8s.sh --deploy-k8s # 使用 sealos 部署 K8s" \
    "USE_PROXY=false ./gpu-k8s.sh --all # 全流程执行但不使用代理" \
    "OFFLINE_DIR=/path/to/offline DEPLOY_MODE=sealos ./gpu-k8s.sh --all # 使用离线包运行" \
    "IMAGE_LOAD_TOOL=nerdctl ./gpu-k8s.sh --load_offline_images # 选择 nerdctl 导入镜像"
}

# === Entry point ===
#######################################
# Run the stages named on the command line, in the order given.
# Globals:   DEPLOY_MODE (read)
# Arguments: one or more stage flags (see show_help)
# Returns:   exits 1 with the help text when no flag or an unknown flag is
#            given, and exits 1 for an unsupported DEPLOY_MODE.
#######################################
main() {
  configure_proxy

  if [ $# -eq 0 ]; then
    show_help
    exit 1
  fi

  # deploy_k8s treats every value other than "sealos" as kubeadm, while --all
  # only installs kubeadm for the exact value "kubeadm"; reject anything else.
  case "$DEPLOY_MODE" in
    kubeadm | sealos) ;;
    *)
      echo "❌ unsupported DEPLOY_MODE: $DEPLOY_MODE (expected kubeadm or sealos)" >&2
      exit 1
      ;;
  esac

  local arg
  # Validate every flag first so a typo cannot surface only after earlier
  # stages have already modified the host.
  for arg in "$@"; do
    case "$arg" in
      --install-base | --install-containerd | --install-nvidia | \
        --install-sealos | --install-kubeadm | --setup-ssh | \
        --load_offline_images | --deploy-k8s | --deploy-plugin | \
        --run-test | --all) ;;
      *)
        show_help
        exit 1
        ;;
    esac
  done

  for arg in "$@"; do
    case "$arg" in
      --install-base) install_base ;;
      --install-containerd) install_containerd ;;
      --install-nvidia) install_nvidia ;;
      --install-sealos) install_sealos ;;
      --install-kubeadm) install_kubeadm ;;
      --setup-ssh) setup_ssh ;;
      --load_offline_images) load_offline_images ;;
      --deploy-k8s) deploy_k8s ;;
      --deploy-plugin) deploy_plugin ;;
      --run-test) run_test ;;
      --all)
        install_base
        install_containerd
        install_nvidia
        install_sealos
        if [ "$DEPLOY_MODE" = "kubeadm" ]; then
          install_kubeadm
        fi
        setup_ssh
        deploy_k8s
        deploy_plugin
        run_test
        ;;
    esac
    echo
  done
}

main "$@"